knitr::opts_chunk$set(warning = FALSE, message = FALSE)
# # initiate git repository
# system('git config --global user.email "shannon.j.oleary@gmail.com"')
# system('git config --global user.name "shannon"')
source("scr/libraries.R")
source("scr/ggplot.R")
source("scr/VCFfilterstats.R")
source("scr/HaplotypR.R")
source("scr/xtrafunctions.R")
source("scr/genind.R")

Make sure "index" is in the file names of the downloaded sequences before demultiplexing; the input file lists the barcode and then the index; demultiplex.txt is in UNIX format.
Parse the process_radtags log files.
# Parse one process_radtags log: read the per-barcode summary table and
# annotate it with library and index identifiers.
#   file      -- path to the *_radtags.log file
#   lib       -- library identifier (e.g. "SFL4")
#   index     -- Illumina index sequence demultiplexed in this log
#   n_samples -- number of barcoded samples in the index (rows to read)
#   skip      -- header lines before the barcode table (13 for most logs)
read_radtags_log <- function(file, lib, index, n_samples, skip = 13) {
  read_table2(file,
              skip = skip, n_max = n_samples,
              col_names = c("BARCODE", "TOTAL_READS", "AMBIG_READS", "LQ_READS", "RETAINED")) %>%
    mutate(PROP_RETAINED = RETAINED/TOTAL_READS,
           INDEX = index,
           LIBRARY = lib) %>%
    select(LIBRARY, INDEX, BARCODE, PROP_RETAINED, TOTAL_READS, RETAINED, AMBIG_READS, LQ_READS)
}
# if already run, remove the list from a previous execution
if (exists("l")) rm(l)
# create empty list
l <- list()
# SFL-4 ----
l[["SFL4-2"]]  <- read_radtags_log("data/SEQ/SFL4_CGATGT_radtags.log", "SFL4", "CGATGT", 44)
l[["SFL4-4"]]  <- read_radtags_log("data/SEQ/SFL4_TGACCA_radtags.log", "SFL4", "TGACCA", 44)
l[["SFL4-7"]]  <- read_radtags_log("data/SEQ/SFL4_CAGATC_radtags.log", "SFL4", "CAGATC", 48)
l[["SFL4-10"]] <- read_radtags_log("data/SEQ/SFL4_TAGCTT_radtags.log", "SFL4", "TAGCTT", 46)
# SFL-5 ----
# BUGFIX: the original code labeled the SFL5-2 entry with LIBRARY "SFL4"
# (copy-paste error); all SFL5 entries now carry LIBRARY "SFL5".
l[["SFL5-2"]]  <- read_radtags_log("data/SEQ/SFL5_CGATGT_radtags.log", "SFL5", "CGATGT", 48)
l[["SFL5-4"]]  <- read_radtags_log("data/SEQ/SFL5_TGACCA_radtags.log", "SFL5", "TGACCA", 47)
l[["SFL5-7"]]  <- read_radtags_log("data/SEQ/SFL5_CAGATC_radtags.log", "SFL5", "CAGATC", 44)
l[["SFL5-10"]] <- read_radtags_log("data/SEQ/SFL5_TAGCTT_radtags.log", "SFL5", "TAGCTT", 44)
# SFL-6 ----
l[["SFL6-2"]]  <- read_radtags_log("data/SEQ/SFL6_CGATGT_radtags.log", "SFL6", "CGATGT", 48)
# the SFL6/TGACCA log has one fewer header line, hence skip = 12
l[["SFL6-4"]]  <- read_radtags_log("data/SEQ/SFL6_TGACCA_radtags.log", "SFL6", "TGACCA", 47, skip = 12)
l[["SFL6-7"]]  <- read_radtags_log("data/SEQ/SFL6_CAGATC_radtags.log", "SFL6", "CAGATC", 48)
# create single data frame
# Flatten the per-index list into one data frame and key each row by
# library + index (e.g. "SFL4_CGATGT") for grouping and faceting.
# NOTE(review): select(-`.id`, LQ_READS) drops the ldply id column; the
# extra LQ_READS term is redundant (the column is already retained) and
# at most affects column order -- confirm the intent was select(-`.id`).
radtagslog <- ldply(l, data.frame) %>%
select(-`.id`, LQ_READS) %>%
unite(LIB_IDX, LIBRARY, INDEX, sep = "_", remove = FALSE)
write_delim(radtagslog, "results/all.radtags.log")

Compare demultiplexed reads per library & index.
Plot distributions of total and proportion of retained reads.
radtagslog %>%
select(LIB_IDX, LIBRARY, INDEX, BARCODE, PROP_RETAINED, TOTAL_READS) %>%
gather(key = STAT, value = READS, 5:6) %>%
ggplot(aes(x = READS)) +
geom_histogram(color = "black", fill = "darkorange") +
labs(x = "reads") +
facet_grid(LIB_IDX ~ STAT, scales = "free") +
theme_standard

Use the same reference as for the southern flounder linkage map.
Values chosen for MiSeq Reference:
Transfer copy of reference.fasta and associated files for K1 = 2 and K2 = 1 into each directory containing demultiplexed and quality trimmed sequences
Any renaming of files needs to happen before fastq-files are mapped. The population designation (before underscore) is used by FreeBayes to call SNPs so it is important that population designation make biological sense. All files should be named POP_PLATE-WELL_SAMPLENAMES.
Run dDocent from within each Library directory to map reads to reference.fasta.
During the mapping stage, dDocent calls BWA to map reads from the individuals in the folder to the generated MiSeqReference and create a -RG.bam-file for each individual. The second column of a BAM (or SAM) file contains FLAGs with binary encoded information on mapping, pairedness etc. that can be used to compare the mapping efficiency of the generated MiSeq references.
Count the number of reads and mapped reads using samtools idxstats <aln-RG.bam>, which will retrieve and print stats in the bam-file. The output is TAB-delimited with each line consisting of reference sequence name, sequence length, # mapped reads and # unmapped (empty) reads. samtools can also be used to query samtools flagstat file.bam, which returns an output containing the number of reads for which each flag is true.
dDocent writes out a file called bamlist.list that contains all the bam files that were generated during read mapping in dDocent using BWA. Write script to gather flagstats from all bam-files.
# write script to gather flagstats
# Build one "samtools flagstat <path><bam> >> <out>" command line per bam
# file listed in a library's dDocent bamlist.list.
#   lib_dir  -- library directory, with trailing slash (prefixed to each bam)
#   out_file -- file the flagstat output is appended to
make_flagstat_cmds <- function(lib_dir, out_file) {
  read_table2(paste0(lib_dir, "bamlist.list"), col_names = "BAM") %>%
    mutate(COMMAND = "samtools flagstat",
           FILE = paste0(lib_dir, BAM),
           OUT = paste(">>", out_file)) %>%
    select(COMMAND, FILE, OUT)
}
l <- list()
l[["SFL4"]] <- make_flagstat_cmds("data/SEQ/SFL-4/", "data/SEQ/SFL4.flagstats")
l[["SFL5"]] <- make_flagstat_cmds("data/SEQ/SFL-5/", "data/SEQ/SFL5.flagstats")
l[["SFL6"]] <- make_flagstat_cmds("data/SEQ/SFL-6/", "data/SEQ/SFL6.flagstats")
# combine into one table (one shell command per row)
bam <- ldply(l, data.frame) %>%
  select(-`.id`)
write_delim(bam, "scr/flagstats.sh", delim = "\t", col_names = FALSE)

Run flagstats.
Write script to gather idxstats from all bam-files.
# write script to gather idxstats
# Build one "samtools idxstats <path><bam> >> <out>" command line per bam
# file listed in a library's dDocent bamlist.list.
#   lib_dir  -- library directory, with trailing slash (prefixed to each bam)
#   out_file -- file the idxstats output is appended to
make_idxstats_cmds <- function(lib_dir, out_file) {
  read_table2(paste0(lib_dir, "bamlist.list"), col_names = "BAM") %>%
    mutate(COMMAND = "samtools idxstats",
           FILE = paste0(lib_dir, BAM),
           OUT = paste(">>", out_file)) %>%
    select(COMMAND, FILE, OUT)
}
l <- list()
l[["SFL4"]] <- make_idxstats_cmds("data/SEQ/SFL-4/", "data/SEQ/SFL4.idxstats")
l[["SFL5"]] <- make_idxstats_cmds("data/SEQ/SFL-5/", "data/SEQ/SFL5.idxstats")
l[["SFL6"]] <- make_idxstats_cmds("data/SEQ/SFL-6/", "data/SEQ/SFL6.idxstats")
# combine into one table (one shell command per row)
bam <- ldply(l, data.frame) %>%
  select(-`.id`)
write_delim(bam, "scr/idxstats.sh", delim = "\t", col_names = FALSE)

Run idxstats.
Appending the file results in the information per individual being printed in a new set of row being appended to the file, i.e. there will be as many rows for a given locus as individuals were mapped. The file can be re-formatted and summary statistics calculated using dplyr and tidyr.
Format idxstats:
# create vectors of files to be imported, reference codes, K1 and K2, dataframe names
filenames <- list.files(path = "data/SEQ", pattern = "*.idxstats")
# first 8 characters, e.g. "SFL4.idx" -- used both as the object name and
# (with "stats" appended below) to rebuild the file name "SFL4.idxstats"
names <- substr(filenames, 1, 8)
# library code, e.g. "SFL4"
lib <- substr(filenames, 1, 4)
# import data
# One data frame per library is created in the global environment via
# assign(); they are collected later with ls(pattern = "*.idx")/get().
for (i in names){
filepath <- file.path("data/SEQ", paste(i, 'stats', sep =""))
# idxstats columns: contig name, length, # mapped reads, # unmapped
# (the 4th column is dropped here)
assign(i, read.table(filepath, sep = "", header = FALSE,
col.names = c("Locus", "Length", "Reads_Mapped", 'blank')) %>%
select(1:3))
}
# # make sure to delete old list if rerunning the code
# rm(dflist_idx)
# rm(MapStats.idx)
# Create list of one dataframe per idxstats file and group by locus
# NOTE(review): ls(pattern = "*.idx") picks up every global object whose
# name matches ".idx" -- relies on the assign() calls above and on no
# other matching objects being present in the environment.
dflist_idx <- lapply(ls(pattern = "*.idx"), get)
for (df in 1:length(dflist_idx)){
x <- dflist_idx[[df]]
# ensure contig names are plain character before grouping
x[['Locus']] <- as.character(x[['Locus']])
x = x %>% group_by(Locus)
dflist_idx[[df]] <- x
}
# Create new dataframes with summary stats per library and bind into final output/dataframe
MapStats.idx <- data.frame()
for (df in 1:length(dflist_idx)){
# per-contig read-mapping summaries for one library
x = summarize(dflist_idx[[df]],
Length = mean(Length),
Mean_Mapped = mean(Reads_Mapped),
Sum_Mapped = sum(Reads_Mapped),
Min_Mapped = min(Reads_Mapped),
Max_Mapped = max(Reads_Mapped),
SD_Mapped = sd(Reads_Mapped))
# recode zeros as NA so contigs with no mapped reads can be counted
# (Not_Mapped) and excluded from the non-zero mean
x[x == 0] <- NA
temp <- summarize(x, Mean_Mapped_Non0 = mean(Mean_Mapped, na.rm = TRUE)) %>%
mutate(Lib = lib[df], # assumes ls() order matches the order of lib -- TODO confirm
Not_Mapped = nrow(filter(x, is.na(Sum_Mapped))),
N_Loci_Ref = nrow(x)) %>%
select(Lib, N_Loci_Ref, Not_Mapped, Mean_Mapped_Non0)
MapStats.idx <- bind_rows(MapStats.idx, temp)
}
# percent of reference contigs with no reads mapped, and the complement
MapStats.idx <- MapStats.idx %>%
mutate(PROP_EMPTY = round(Not_Mapped/N_Loci_Ref*100, digits = 2),
CONTIGS_MAPPED = N_Loci_Ref - Not_Mapped)
write.table(MapStats.idx, "results/MapStats.idx", quote = FALSE)
# remove large (duplicate) files
rm(SFL4.idx)
rm(SFL5.idx)
rm(SFL6.idx)
MapStats.idx

Format flagstats:
# Files to be imported
filenames <- list.files(path='data/SEQ', pattern = '*.flagstats')
# create vectors of files to be imported
# first 9 characters, e.g. "SFL4.flag"; "stats" is appended below to
# rebuild the file name "SFL4.flagstats"
names <- substr(filenames, 1, 9)
lib <- substr(filenames, 1, 4)
# import data
# flagstat lines look like "123 + 0 in total ..."; splitting on "+"
# yields the read count (N_Reads) and the category text (CAT)
for (i in names){
filepath <- file.path('data/SEQ', paste(i, 'stats', sep =""))
assign(i, read.csv(filepath, sep = "+", header = FALSE,
col.names = c("N_Reads", "CAT"),
stringsAsFactors = FALSE) %>%
select(1:2))
}
# Create list of one dataframe per flagstats file and create tidy data set
# should be 3 elements/libraries
rm(dflist_flag) # warns "object 'dflist_flag' not found" on a first run
dflist_flag <- lapply(ls(pattern = "*flag"), get)
# Change N_Reads to numeric
for (df in 1:length(dflist_flag)){
x <- dflist_flag[[df]]
x[['N_Reads']] <- as.numeric(x[['N_Reads']])
dflist_flag[[df]] <- x
} # may warn "NAs introduced by coercion" where the count field is not numeric
for (df in 1:length(dflist_flag)){
x <- dflist_flag[[df]]
# assumes flagstat prints 14 lines per bam file, so rows/14 = number of
# individuals -- TODO confirm for the samtools version used
n <- nrow(x)/14
x <- x %>%
# keep three flagstat categories: mapped, properly paired, and the
# mapQ>=5 mate-on-different-contig line (labeled "Mismatch" below)
filter(grepl("0 mapped|properly paired|mapQ>=5", CAT)) %>%
mutate(MAPSTAT = ifelse(grepl("mapQ>=5", CAT), "Mismatch",
ifelse(grepl("properly", CAT), "Prop_Paired", "Mapped"))) %>%
# 3 retained rows per individual, in file order
mutate(Ind = c(rep(1:n, each = 3))) %>%
# not sure if extra individual in there somehow
select(4, 3, 1) %>%
spread(MAPSTAT, N_Reads)
dflist_flag[[df]] <- x
}
# Create new dataframes with summary stats and add to main final data frame
MapStats.flag <- data.frame()
for (df in 1:length(dflist_flag)){
# per-library totals and per-individual means across individuals
x = summarize(dflist_flag[[df]], Sum_Mapped = sum(Mapped),
Reads_Mapped = mean(Mapped),
Sum_Paired = sum(Prop_Paired),
Mean_Paired = mean(Prop_Paired),
Sum_Mismatch = sum(Mismatch),
Mean_Mismatch = mean(Mismatch)) %>%
mutate(Lib = lib[df]) %>% # assumes ls() order matches the order of lib -- TODO confirm
select(7, 1:6)
MapStats.flag <- bind_rows(MapStats.flag, x)
}
# write to file
write.table(MapStats.flag, "results/MapStats.flag", quote = FALSE)
MapStats.flag

# combine files
mapstats <- left_join(MapStats.idx, MapStats.flag) %>%
mutate(PROP_MISMATCH = Sum_Mismatch/Sum_Mapped)Joining, by = "Lib"
# write summary stats file
write.table(mapstats, file = "results/BWA_mapping.stats", quote = FALSE, sep = " ")

Compare the number of reference contigs to which no reads mapped, per library.
# plot no of loci vs "empty" loci
p1 <- ggplot(mapstats, aes(x = Lib, y = PROP_EMPTY)) +
geom_bar(stat = "identity", color = "black", fill = "darkorange") +
scale_y_continuous(limits = c(0, 6)) +
labs(x = "", y = "% contigs w/no reads mapped") +
theme_standard
p2 <- ggplot(mapstats, aes(x = Lib, y = Reads_Mapped)) +
geom_bar(stat = "identity", color = "black", fill = "darkorange") +
labs(x = "library", y = "mean reads mapped per indv") +
theme_standard
p3 <- ggplot(mapstats, aes(x = Lib, y = PROP_MISMATCH)) +
geom_bar(stat = "identity", color = "black", fill = "darkorange") +
labs(x = "",y = "% reads not mapped as pair") +
theme_standard
multiplot(p1, p2, p3, cols = 3)

Transfer a copy of reference.fasta and associated files for K1 = 4 and K2 = 1 into the SNP calling directory.
Create symlinks from all fq, bam and bam.bai-files for each separately mapped library in SNP_Calling folder.
Execute dDocent from within SNP_Calling-folder to call variants across all individuals (all libraries).
File TotalRawSNPs.vcf contains all raw SNP/INDEL calls. There is no need to keep links of the fq.gz-, bam-, and .bam.bai-files after SNPs have been called. Copy TotalRawSNPs.vcf to VCF_Filtering for SNP filtering.
dDocent uses FreeBayes to call SNPs and write a VCF-file TotalrawSNPs.vcf. This data set was filtered to remove low quality and artefactual SNP sites, paralogs and low quality individuals based on levels of missing data, minimum/maximum read depth, genotype call rate, and minor allele frequencies. Contigs may contain more than one SNP; the script rad_haplotyper.pl was used to create haplotypes for each locus.
Generate a list of all individuals included in the SNP data set called by FreeBayes in the dDocent pipeline.
vcfsamplenames data/VCF/temp/TotalRawSNPs.vcf > data/VCF/raw.indUse Ind_raw file to write text files of individuals in each library and in regional groupings.
# import individuals in raw data set ----
# LIB_ID format is POP_PLATEWELL_SAMPLEID (underscore-separated), per the
# naming convention required before mapping
Ind_RAW <- read.table("data/VCF/raw.ind",
header = FALSE, stringsAsFactors = FALSE,
col.names = "LIB_ID") %>%
separate(LIB_ID, into = c("POP", "LIB", "SAMPLE_ID"), sep = "_", remove = FALSE)
# identify duplicates
# samples genotyped more than once (same SAMPLE_ID on several plates)
temp <- Ind_RAW %>%
group_by(SAMPLE_ID) %>%
count() %>%
filter(n > 1) %>%
select(SAMPLE_ID)
dup <- Ind_RAW %>%
filter(SAMPLE_ID %in% temp$SAMPLE_ID) %>%
select(LIB_ID)
# one LIB_ID per line, no header/quotes (plain list for vcftools --keep)
write.table(dup, "data/VCF/duplicate.ind",
col.names = FALSE, row.names = FALSE, quote = FALSE)
# create files with individuals by population ----
# Write one .ind file (one LIB_ID per line, no header/quotes) per subset.
write_ind_file <- function(ind, out_file) {
  write.table(select(ind, LIB_ID), out_file,
              col.names = FALSE, row.names = FALSE, quote = FALSE)
}
# Population codes mapped to output files; note that POP == "FL"
# (Gulf coast Florida) is written to FLG.ind to distinguish it from
# FLA (Atlantic coast Florida).
pop_files <- c(AL = "AL.ind", FL = "FLG.ind", FLA = "FLA.ind",
               LA = "LA.ind", MS = "MS.ind", SC = "SC.ind", TX = "TX.ind")
for (pop in names(pop_files)) {
  write_ind_file(filter(Ind_RAW, POP == pop),
                 file.path("data/VCF", pop_files[[pop]]))
}
# create files with individuals per library ----
# Plate codes inside LIB identify the library: 4A/4B -> SFL4, etc.
lib_patterns <- c(SFL4 = "4A|4B", SFL5 = "5A|5B", SFL6 = "6A|6B")
for (lib in names(lib_patterns)) {
  write_ind_file(filter(Ind_RAW, grepl(lib_patterns[[lib]], LIB)),
                 file.path("data/VCF", paste0(lib, ".ind")))
}

Use VCFtools to create stats files for depth, missing data, heterozygosity and site quality for the raw data.
vcftools --vcf data/VCF/temp/TotalRawSNPs.vcf --out data/VCF/SFL_raw --depth
vcftools --vcf data/VCF/temp/TotalRawSNPs.vcf --out data/VCF/SFL_raw --site-mean-depth
vcftools --vcf data/VCF/temp/TotalRawSNPs.vcf --out data/VCF/SFL_raw --site-quality
vcftools --vcf data/VCF/temp/TotalRawSNPs.vcf --out data/VCF/SFL_raw --missing-indv
vcftools --vcf data/VCF/temp/TotalRawSNPs.vcf --out data/VCF/SFL_raw --missing-site
vcftools --vcf data/VCF/temp/TotalRawSNPs.vcf --out data/VCF/SFL_raw --het
vcftools --vcf data/VCF/temp/TotalRawSNPs.vcf --out data/VCF/SFL_raw --singletonsCompare raw stats.
Remove (known) low quality individuals from data set:
Identify low quality individuals to remove from the data set; defined here as individuals with a mean coverage of fewer than five reads across all loci or at least 70% missing data (the thresholds applied in the code below):
# Flag individuals with mean depth < 5 or >= 70% missing genotypes.
# NOTE(review): the surrounding text quotes different thresholds
# (3 reads / 85% missing) -- the values used here are what actually ran.
LQindv <- ind_stats_raw %>%
filter(MEAN_DEPTH_SFL_raw < 5 | MISS_SFL_raw >= 0.7) %>%
select(INDV)
# interactive inspection only; has no effect in a non-interactive run
View(LQindv)
# Also remove all individuals from index-10 sublibraries (4BE-4BH, 6B)
Idx10 <- ind_stats_raw %>%
filter(grepl("4BE", LIB) |
grepl("4BF", LIB) |
grepl("4BG", LIB) |
grepl("4BH", LIB) |
grepl("6B", LIB)) %>%
select(INDV)
LQindv <- bind_rows(LQindv, Idx10)
# manually curated list of additional individuals to remove
# (presumably contamination/problem samples, per the name "cont" -- verify)
cont <- data.frame(INDV = c("FL_5AC02_FL-SF-2016-28", "FL_5AF02_FL-SF-2016-34", "FLA_4AF11_FL-SF-2016-102", "MS_4AC10_MS-SF-2015-01",
"MS_5AD10_MS-SF-2015-52", "SC_4AB01_Ple-01136", "SC_4AB08_Ple-01166", "SC_4AC11_Ple-01833", "SC_4AD01_Ple-01193",
"SC_4AE03_Ple-01194", "SC_4BB08_Ple-01832", "SC_5BB01_Ple-00807", "SC_5BB02_Ple-00808", "SC_5BB04_Ple-00846",
"SC_5BB05_Ple-00921", "SC_5BB06_Ple-00938", "SC_5BB07_Ple-00974", "TX_5BE09_SL-SF-2016-34"), stringsAsFactors = FALSE)
LQindv <- bind_rows(LQindv, cont)
write_delim(LQindv, "data/VCF/LQ_raw.ind", delim = "\t")

Remove low quality individuals.
Number of individuals per population.
# import individuals in raw data set ----
Ind_F0 <- read.table("data/VCF/F0.ind",
header = FALSE, stringsAsFactors = FALSE,
col.names = "LIB_ID") %>%
separate(LIB_ID, into = c("POP", "LIB", "SAMPLE_ID"), sep = "_", remove = FALSE)
count(Ind_F0, POP)Compare stats:
# load stats files ----
ind_stats_F0 <- read.ind.stats(dir = "data/VCF", vcf = "SFL.F0") %>%
separate(INDV, into = c("SP", "LIB", "SAMPLE_ID"), sep = "_", remove = FALSE)Joining, by = "INDV"
Joining, by = "INDV"
loc_stats_F0 <- read.loc.stats(dir = "data/VCF/", vcf = "SFL.F0")Joining, by = c("CHR", "POS")
Import singletons and genotype depth file to create list of loci to exclude.
# number of individuals
# gdepth columns are CHROM, POS, then one column per individual, so the
# last individual column index is nrow(ind_stats_F0) + 2
n <- nrow(ind_stats_F0)+2
# read singletons file
# vcftools --singletons output: CHROM POS SINGLETON/DOUBLETON ALLELE INDV;
# a single-base allele is a SNP, anything longer is treated as an INDEL
singletons <- read_table2("data/VCF/SFL.F0.singletons") %>%
mutate(VARIANT = ifelse(ALLELE %in% c("A", "T", "C", "G"), "SNP", "INDEL"))
chrom <- unique(singletons$CHROM)
# read genotype depth file and join with singletons
# restrict to contigs that contain a singleton, then reshape wide -> long
# (one row per individual x site) before joining on CHROM/POS/INDV
gdepth <- read_table2("data/VCF/SFL.F0.gdepth") %>%
filter(CHROM %in% chrom) %>%
gather(key = INDV, value = DEPTH, 3:n)
singletons <- left_join(singletons, gdepth)
Data set contains `nrow(singletons)` singletons.
# filter doubletons
doubletons <- singletons %>%
filter(`SINGLETON/DOUBLETON` == "D")

Of those, `nrow(doubletons)` are called as a homozygote in one individual.
# number of contigs in data set
contigs_total <- loc_stats_F0 %>%
distinct(CHROM)
contigs_singletons <- singletons %>%
distinct(CHROM)
contigs <- singletons %>%
count(CHROM)
ggplot(contigs, aes(x = n)) +
geom_histogram(binwidth = 1, color = "black", fill = "darkorange") +
geom_vline(aes(xintercept = mean(n, na.rm = TRUE)),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "number of singletons per locus") +
theme_standard

The data set contains `nrow(contigs_total)` contigs, of which `nrow(contigs_singletons)` (`round(nrow(contigs_singletons)/nrow(contigs_total)*100, digits = 2)`%) contain singletons.
contigs <- singletons %>%
group_by(`SINGLETON/DOUBLETON`) %>%
count(CHROM)
ggplot(contigs, aes(x = n)) +
geom_histogram(binwidth = 1, color = "black", fill = "darkorange") +
facet_grid(. ~ `SINGLETON/DOUBLETON`) +
labs(x = "number of doubletons/singeltons per locus") +
theme_standardTarget is 20 reads per locus; identify number of singletons/doubletons with depth < 20 reads.
count(singletons, DEPTH <= 20)

Distribution of genotype depth per singleton/doubleton.
ggplot(singletons, aes(x = DEPTH)) +
geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
facet_grid(. ~ `SINGLETON/DOUBLETON`, scales = "free") +
labs(x = "read depth") +
theme_standardQuantiles distribution of read depth for SNP loci called in only one individual.
quantile(singletons$DEPTH, probs = c(.05, .25, .5, .75, .95, .99))

Distribution of doubletons (homozygous genotype for that individual) and singletons (heterozygous genotype).
Ind <- singletons %>%
group_by(`SINGLETON/DOUBLETON`) %>%
count(INDV)
Ind <- left_join(Ind, ind_stats_F0)
ggplot(Ind, aes(x = n)) +
geom_histogram(binwidth = 50, color = "black", fill = "darkorange") +
facet_grid(. ~ `SINGLETON/DOUBLETON`, scales = "free") +
labs(x = "number of singletons per locus") +
theme_standardQuantiles distribution number of singletons called in one individual.
quantile(Ind$n, probs = c(.01, .05, .25, .5, .75, .99))Compare the number of SNPs vs complex variants (INDELs).
count(singletons, VARIANT)The QUAL column of a VCF file is a phred based score indicating the probability that the variant shown in the ALT column is wrong. Given the Phred quality score (Q), and the probability that a base is incorrectly called (P), Q = -10(Log10P).
A quality score of 20 indicates, a 1 in 100 chance that the SNP site has been called incorrectly (i.e. 99% probability that correct call).
Filter loci with quality score < 20 or minor allele count of 3. Code genotypes with genotype call quality < 20 or genotype depth < 3 as missing.
# filter LQ SNP calls
vcftools --vcf data/VCF/temp/SFL.F0.recode.vcf --out data/VCF/temp/SFL.F1 --minQ 20 --minGQ 20 --minDP 3 --mac 3 --recode --recode-INFO-all
# query stats
vcftools --vcf data/VCF/temp/SFL.F1.recode.vcf --out data/VCF/SFL.F1 --depth
vcftools --vcf data/VCF/temp/SFL.F1.recode.vcf --out data/VCF/SFL.F1 --site-mean-depth
vcftools --vcf data/VCF/temp/SFL.F1.recode.vcf --out data/VCF/SFL.F1 --missing-indv
vcftools --vcf data/VCF/temp/SFL.F1.recode.vcf --out data/VCF/SFL.F1 --missing-site
vcftools --vcf data/VCF/temp/SFL.F1.recode.vcf --out data/VCF/SFL.F1 --het
vcftools --vcf data/VCF/temp/SFL.F1.recode.vcf --out data/VCF/SFL.F1 --geno-depth
vcftools --vcf data/VCF/temp/SFL.F1.recode.vcf --out data/VCF/SFL.F1 --singletons
vcftools --vcf data/VCF/temp/SFL.F1.recode.vcf --out data/VCF/SFL.F1 --indv-freq-burden
vcftools --vcf data/VCF/temp/SFL.F1.recode.vcf --out data/VCF/SFL.F1 --freq Compare stats post-filtering.
# load stats files ----
ind_stats_F1 <- read.ind.stats(dir = "data/VCF", vcf = "SFL.F1") %>%
separate(INDV, into = c("SP", "LIB", "SAMPLE_ID"), sep = "_", remove = FALSE)
loc_stats_F1 <- read.loc.stats(dir = "data/VCF/", vcf = "SFL.F1")

Data set contains 370 individuals and 386852 SNP sites.
Removing low confidence SNP loci and genotype calls results in a reduction in the number of the (maximum) SNPs per locus.
p1 <- loc_stats_raw %>%
count(CHR) %>%
ggplot(aes(x = n)) +
geom_histogram(binwidth = 1, color = "black", fill = "darkorange") +
geom_vline(aes(xintercept = mean(n, na.rm = TRUE)),
color = "darkblue", linetype = "dashed", size = 1) +
scale_x_continuous(limits = c(0, 100)) +
labs(x = "number of SNPs per locus") +
theme_standard
p2 <- loc_stats_F1 %>%
count(CHR) %>%
ggplot(aes(x = n)) +
geom_histogram(binwidth = 1, color = "black", fill = "darkorange") +
geom_vline(aes(xintercept = mean(n, na.rm = TRUE)),
color = "darkblue", linetype = "dashed", size = 1) +
scale_x_continuous(limits = c(0, 100)) +
labs(x = "number of SNPs per locus") +
theme_standard
m1a <- multiplot(p1, p2, cols=1)Coding genotypes with low read depth (< 3) as missing, results in an overall increase in missing data per locus and a shift in more individuals with more missing data - this is because individuals (and loci) with coverage issues are now characterized by higher missing data.
# Read a vcftools missingness table (.imiss or .lmiss), keep the key
# columns plus F_MISS, and rename F_MISS to `label` so tables from
# different filtering stages can be joined side by side.
#   file  -- path to the vcftools output
#   keys  -- join key column(s): "INDV" or c("CHR", "POS")
#   label -- new name for the F_MISS column (e.g. "raw", "F1")
read_missing <- function(file, keys, label) {
  tab <- read.table(file, header = TRUE, stringsAsFactors = FALSE)
  tab <- tab[, c(keys, "F_MISS"), drop = FALSE]
  names(tab)[names(tab) == "F_MISS"] <- label
  tab
}
# individual-level missing data: filtered (F1) vs raw
imiss <- left_join(read_missing("data/VCF/SFL.F1.imiss", "INDV", "F1"),
                   read_missing("data/VCF/SFL_raw.imiss", "INDV", "raw"))
p1 <- ggplot(imiss, aes(x = raw, y = F1)) +
geom_point(shape = 1) +
geom_abline(slope = 1, color = "darkblue", linetype = "dashed", size = 1) +
scale_x_continuous(limits = c(0, 1)) +
scale_y_continuous(limits = c(0, 1)) +
labs(x = "indv missing data raw", y = "indv missing data F1") +
theme_standard
# locus-level missing data: filtered (F1) vs raw
lmiss <- left_join(read_missing("data/VCF/SFL.F1.lmiss", c("CHR", "POS"), "F1"),
                   read_missing("data/VCF/SFL_raw.lmiss", c("CHR", "POS"), "raw"))
p2 <- ggplot(lmiss, aes(x = raw, y = F1)) +
geom_point(shape = 1) +
geom_abline(slope = 1, color = "darkblue", linetype = "dashed", size = 1) +
scale_x_continuous(limits = c(0, 1)) +
scale_y_continuous(limits = c(0, 1)) +
labs(x = "loci missing data raw", y = "loci missing data F1") +
theme_standard
m1b <- multiplot(p1, p2, cols=2)

Compare depth per individual after removing LQ SNP loci and genotypes.
# mean depth per individual in the raw call set
depth_raw <- read.table("data/VCF/SFL_raw.idepth",
header = TRUE, stringsAsFactors = FALSE) %>%
select(INDV, MEAN_DEPTH) %>%
rename(raw = MEAN_DEPTH)
# mean depth per individual after removing LQ SNP loci and genotypes
depth_F1 <- read.table("data/VCF/SFL.F1.idepth",
header = TRUE, stringsAsFactors = FALSE) %>%
select(INDV, MEAN_DEPTH) %>%
rename(F1 = MEAN_DEPTH)
# one row per individual with a raw and an F1 mean-depth column
idepth <- left_join(depth_raw, depth_F1)
ggplot(idepth, aes(x = raw, y = F1)) +
geom_point(shape = 1) +
geom_abline(slope = 1, color = "darkblue", linetype = "dashed", size = 1) +
scale_x_continuous(limits = c(0, 150)) +
scale_y_continuous(limits = c(0, 150)) +
labs(x = "mean depth ind raw", y = "mean depth ind F1") +
theme_standard

Remove loci with genotype call rate < 50% and mean depth < 10 reads across all individuals.
vcftools --vcf data/VCF/temp/SFL.F1.recode.vcf --out data/VCF/temp/SFL.F2a --max-missing 0.5 --min-meanDP 10 --recode --recode-INFO-all
# library SFL-4
vcftools --vcf data/VCF/temp/SFL.F2a.recode.vcf --out data/VCF/temp/SFL4 --keep data/VCF/SFL4.ind --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/SFL4.recode.vcf --out data/VCF/SFL4 --missing-site
# library SFL-5
vcftools --vcf data/VCF/temp/SFL.F2a.recode.vcf --out data/VCF/temp/SFL5 --keep data/VCF/SFL5.ind --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/SFL5.recode.vcf --out data/VCF/SFL5 --missing-site
# library SFL-6
vcftools --vcf data/VCF/temp/SFL.F2a.recode.vcf --out data/VCF/temp/SFL6 --keep data/VCF/SFL6.ind --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/SFL6.recode.vcf --out data/VCF/SFL6 --missing-site
# Alabama
vcftools --vcf data/VCF/temp/SFL.F2a.recode.vcf --out data/VCF/temp/AL --keep data/VCF/AL.ind --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/AL.recode.vcf --out data/VCF/AL --missing-site
# Florida Atlantic
vcftools --vcf data/VCF/temp/SFL.F2a.recode.vcf --out data/VCF/temp/FLA --keep data/VCF/FLA.ind --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/FLA.recode.vcf --out data/VCF/FLA --missing-site
# Florida Gulf
vcftools --vcf data/VCF/temp/SFL.F2a.recode.vcf --out data/VCF/temp/FLG --keep data/VCF/FLG.ind --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/FLG.recode.vcf --out data/VCF/FLG --missing-site
# Louisiana
vcftools --vcf data/VCF/temp/SFL.F2a.recode.vcf --out data/VCF/temp/LA --keep data/VCF/LA.ind --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/LA.recode.vcf --out data/VCF/LA --missing-site
# Mississippi
vcftools --vcf data/VCF/temp/SFL.F2a.recode.vcf --out data/VCF/temp/MS --keep data/VCF/MS.ind --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/MS.recode.vcf --out data/VCF/MS --missing-site
# South Carolina
vcftools --vcf data/VCF/temp/SFL.F2a.recode.vcf --out data/VCF/temp/SC --keep data/VCF/SC.ind --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/SC.recode.vcf --out data/VCF/SC --missing-site
# Texas
vcftools --vcf data/VCF/temp/SFL.F2a.recode.vcf --out data/VCF/temp/TX --keep data/VCF/TX.ind --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/TX.recode.vcf --out data/VCF/TX --missing-siteCompare distribution of missing data per locus per library.
# import missing data per locus for each library and combine into a
# single data frame (columns CHR, POS, F_MISS, LIB)
libs <- c("SFL4", "SFL5", "SFL6")
loc_missing <- lapply(libs, function(lib) {
  read.table(paste0("data/VCF/", lib, ".lmiss"),
             header = TRUE, stringsAsFactors = FALSE) %>%
    select(CHR, POS, F_MISS) %>%
    mutate(LIB = lib)
})
# create data frame with all information
loc_missing <- ldply(loc_missing, data.frame)
ggplot(loc_missing, aes(x = F_MISS)) +
geom_histogram(binwidth = .1, color = "black", fill = "darkorange") +
geom_vline(aes(xintercept = 0.5),
color = "darkred", linetype = "dashed", size = 1) +
labs(x = "missing data per locus") +
facet_grid(LIB ~ .) +
theme_standard

Flag loci that were not called in > 50% of individuals in a given library.
# identify loci with high missing data in each library
SNPs <- filter(loc_missing, F_MISS > 0.5) %>%
arrange(CHR, POS)
count(SNPs, LIB)

LQloci_lib <- SNPs %>%
select(CHR, POS) %>%
unique()

`nrow(LQloci_lib)` loci were called in less than 50% of individuals in one or more libraries. Loci that are inconsistently called between libraries can result in library effects.
Compare distribution of missing data per locus per sample location.
# import missing data per locus for each sample location and combine into
# a single data frame (columns CHR, POS, F_MISS, POP)
pops <- c("AL", "FLA", "FLG", "LA", "MS", "SC", "TX")
loc_missing <- lapply(pops, function(pop) {
  read.table(paste0("data/VCF/", pop, ".lmiss"),
             header = TRUE, stringsAsFactors = FALSE) %>%
    select(CHR, POS, F_MISS) %>%
    mutate(POP = pop)
})
# create data frame with all information
loc_missing <- ldply(loc_missing, data.frame)
# Histogram of per-site missingness faceted by sampling location; dashed
# red line marks the 50% threshold used for flagging below.
ggplot(loc_missing, aes(x = F_MISS)) +
geom_histogram(binwidth = .05, color = "black", fill = "darkorange") +
geom_vline(aes(xintercept = 0.5),
color = "darkred", linetype = "dashed", size = 1) +
labs(x = "missing data per locus") +
facet_grid(POP ~ .) +
theme_standardFlag loci that were not called in > 50% of individuals at a given sample location.
# identify loci with high missing data at each sample location
# (sites missing in > 50% of individuals within any single location)
SNPs <- filter(loc_missing, F_MISS > 0.5) %>%
arrange(CHR, POS)
count(SNPs, POP)LQloci_pop <- SNPs %>%
select(CHR, POS) %>%
unique()
# union of library-flagged and location-flagged loci
LQloci <- bind_rows(LQloci_lib, LQloci_pop) %>%
unique()
# Write contig/position to file (format consumed by vcftools --exclude-positions)
write.table(LQloci, file = "data/VCF/LQ_F2b.loci",
col.names= FALSE, row.names = FALSE, quote = FALSE)were flagged as being called in less than 50% of individuals in one or more libraries or in one or more sample locations.
Remove loci that were not consistently called across all libraries and sample locations.
# Drop inconsistently-called loci (LQ_F2b.loci) to produce the F2b set,
# then recompute per-individual missingness for the next cutoff step.
vcftools --vcf data/VCF/temp/SFL.F2a.recode.vcf --out data/VCF/temp/SFL.F2b --exclude-positions data/VCF/LQ_F2b.loci --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/SFL.F2b.recode.vcf --out data/VCF/SFL.F2b --missing-indvIdentify individuals with > 40% missing data
# determine cutoff: inspect per-individual missingness (F2b set) and
# flag individuals with > 40% missing genotypes for removal.
imiss <- read.table("data/VCF/SFL.F2b.imiss",
header = TRUE, stringsAsFactors = FALSE)
# histogram with the 40% cutoff marked (dashed blue line)
ggplot(imiss, aes(x = F_MISS)) +
geom_histogram(binwidth = 0.01, color = "black", fill = "darkorange") +
geom_vline(xintercept = 0.4, color = "darkblue", linetype = "dashed", size = 1) +
theme_standardimiss <- imiss %>%
filter(F_MISS > 0.4) %>%
select(INDV)
# NOTE: View() is interactive-only; it is a no-op / error in batch runs.
View(imiss)
# NOTE(review): col.names = TRUE writes an "INDV" header line into the
# file passed to vcftools --remove — confirm vcftools tolerates it.
write.table(imiss, "data/VCF/F2b_LQ.indv",
col.names = TRUE, row.names = FALSE, quote = FALSE)Remove flagged individuals.
# Remove flagged LQ individuals to produce the F2 set, then regenerate
# the full suite of per-individual/per-site stats for post-filter QC.
vcftools --vcf data/VCF/temp/SFL.F2b.recode.vcf --out data/VCF/temp/SFL.F2 --remove data/VCF/F2b_LQ.indv --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/SFL.F2.recode.vcf --out data/VCF/SFL.F2 --depth
vcftools --vcf data/VCF/temp/SFL.F2.recode.vcf --out data/VCF/SFL.F2 --site-mean-depth
vcftools --vcf data/VCF/temp/SFL.F2.recode.vcf --out data/VCF/SFL.F2 --missing-indv
vcftools --vcf data/VCF/temp/SFL.F2.recode.vcf --out data/VCF/SFL.F2 --missing-site
vcftools --vcf data/VCF/temp/SFL.F2.recode.vcf --out data/VCF/SFL.F2 --het
vcftools --vcf data/VCF/temp/SFL.F2.recode.vcf --out data/VCF/SFL.F2 --geno-depthAnalyze stats post-filtering:
# load stats files ----
# read.ind.stats/read.loc.stats are project helpers (scr/VCFfilterstats.R)
# that collate the vcftools output files for the given prefix.
# INDV is assumed to be formatted SP_LIB_SAMPLE_ID — TODO confirm.
ind_stats_F2 <- read.ind.stats(dir = "data/VCF", vcf = "SFL.F2") %>%
separate(INDV, into = c("SP", "LIB", "SAMPLE_ID"), sep = "_", remove = FALSE)
loc_stats_F2 <- read.loc.stats(dir = "data/VCF/", vcf = "SFL.F2")Compare change in missing data of loci and individuals.
# import imiss ----
# Compare missingness across filter stages (raw, F1, F2): build one wide
# table per individual (imiss) and one per locus (lmiss), with a column
# per stage, then plot raw vs F2 for both.
# create empty list
imiss <- list()
# import missing data per individual (not per locus)
imiss[[1]] <- read.table("data/VCF/SFL_raw.imiss",
header = TRUE, stringsAsFactors = FALSE) %>%
select(INDV, F_MISS) %>%
mutate(FIL = "raw")
imiss[[2]] <- read.table("data/VCF/SFL.F1.imiss",
header = TRUE, stringsAsFactors = FALSE) %>%
select(INDV, F_MISS) %>%
mutate(FIL = "F1")
imiss[[3]] <- read.table("data/VCF/SFL.F2.imiss",
header = TRUE, stringsAsFactors = FALSE) %>%
select(INDV, F_MISS) %>%
mutate(FIL = "F2")
# create data frame with all information (wide: one column per stage)
imiss <- ldply(imiss, data.frame) %>%
spread(key = FIL, value = F_MISS)
# import lmiss ----
# create empty list
lmiss <- list()
# import missing data per locus
lmiss[[1]] <- read.table("data/VCF/SFL_raw.lmiss",
header = TRUE, stringsAsFactors = FALSE) %>%
select(CHR, POS, F_MISS) %>%
mutate(FIL = "raw")
lmiss[[2]] <- read.table("data/VCF/SFL.F1.lmiss",
header = TRUE, stringsAsFactors = FALSE) %>%
select(CHR, POS, F_MISS) %>%
mutate(FIL = "F1")
lmiss[[3]] <- read.table("data/VCF/SFL.F2.lmiss",
header = TRUE, stringsAsFactors = FALSE) %>%
select(CHR, POS, F_MISS) %>%
mutate(FIL = "F2")
# create data frame with all information
lmiss <- ldply(lmiss, data.frame) %>%
spread(key = FIL, value = F_MISS)
# plot ----
# points below the dashed identity line improved (less missing after F2)
p1 <- ggplot(imiss, aes(x = F2, y = raw)) +
geom_point(shape = 1) +
geom_abline(slope = 1, color = "darkblue", linetype = "dashed", size = 1) +
scale_x_continuous(limits = c(0, 1)) +
scale_y_continuous(limits = c(0, 1)) +
labs(x = "imiss F2", y = "imiss raw") +
theme_standard
p2 <- ggplot(lmiss, aes(x = F2, y = raw)) +
geom_point(shape = 1) +
geom_abline(slope = 1, color = "darkblue", linetype = "dashed", size = 1) +
scale_x_continuous(limits = c(0, 1)) +
scale_y_continuous(limits = c(0, 1)) +
labs(x = "lmiss F2", y = "lmiss raw") +
theme_standard
m2 <- multiplot(p1, p2, cols=2)Determine mean depth and variance per locus per library.
# Per-library mean site depth: subset the F2 set to each library's
# individuals, then compute --site-mean-depth (.ldepth.mean) per library.
# library SFL-4
vcftools --vcf data/VCF/temp/SFL.F2.recode.vcf --out data/VCF/temp/SFL4 --keep data/VCF/SFL4.ind --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/SFL4.recode.vcf --out data/VCF/SFL4 --site-mean-depth
# library SFL-5
vcftools --vcf data/VCF/temp/SFL.F2.recode.vcf --out data/VCF/temp/SFL5 --keep data/VCF/SFL5.ind --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/SFL5.recode.vcf --out data/VCF/SFL5 --site-mean-depth
# library SFL-6
vcftools --vcf data/VCF/temp/SFL.F2.recode.vcf --out data/VCF/temp/SFL6 --keep data/VCF/SFL6.ind --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/SFL6.recode.vcf --out data/VCF/SFL6 --site-mean-depthCompare distribution of depth coverage per locus per library. Identify loci that do not have consistent coverage between libraries (can lead to library effects), and/or across individuals.
# Import per-library mean site depth (.ldepth.mean) and plot the
# distribution per library; dashed lines mark the grand mean (red)
# and a 20x reference (blue).
# create empty list
loc_depth <- list()
# import depth data per locus
loc_depth[[1]] <- read.table("data/VCF/SFL4.ldepth.mean",
header = TRUE, stringsAsFactors = FALSE) %>%
mutate(LIB = "SFL4")
loc_depth[[2]] <- read.table("data/VCF/SFL5.ldepth.mean",
header = TRUE, stringsAsFactors = FALSE) %>%
mutate(LIB = "SFL5")
loc_depth[[3]] <- read.table("data/VCF/SFL6.ldepth.mean",
header = TRUE, stringsAsFactors = FALSE) %>%
mutate(LIB = "SFL6")
# create data frame with all information
loc_depth <- ldply(loc_depth, data.frame)
ggplot(loc_depth, aes(x = MEAN_DEPTH)) +
geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
geom_vline(aes(xintercept = mean(MEAN_DEPTH, na.rm = TRUE)),
color = "darkred", linetype = "dashed", size = 1) +
geom_vline(xintercept = 20,
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "mean depth per locus") +
facet_grid(LIB ~ .) +
theme_standardIdentify loci with large variance in mean depth across libraries and/or individuals by calculating the coefficient of variance (STD/MEAN).
# calculate mean depth weighted by library
# NOTE: distinct(CHROM, .keep_all = TRUE) keeps only the FIRST SNP per
# contig within each library; MEAN/STD are then computed per contig
# across the three per-library values.
depth_comp <- loc_depth %>%
group_by(LIB) %>%
distinct(CHROM, .keep_all = TRUE) %>%
select(-POS) %>%
ungroup() %>%
group_by(CHROM) %>%
summarise(MEAN = mean(MEAN_DEPTH),
STD = sd(MEAN_DEPTH))
# mean depth across all individuals (one row per contig, first SNP kept)
temp <- read.table("data/VCF/SFL.F2.ldepth.mean",
header = TRUE, stringsAsFactors = FALSE) %>%
distinct(CHROM, .keep_all = TRUE) %>%
select(-POS) %>%
mutate(STD_DEPTH = sqrt(VAR_DEPTH))
# calculate coefficient of variation (as %): between-library (LIB) and
# across-individual (IND). left_join has no explicit `by`; it joins on
# the shared CHROM column — presumably the only shared name, verify.
depth_comp <- left_join(depth_comp, temp) %>%
mutate(COEFF_VAR_LIB = STD/MEAN*100,
COEFF_VAR_IND = STD_DEPTH/MEAN_DEPTH*100)Compare mean across all individual to mean weighted by library and coefficient of variance of read depth across individuals and between libraries:
# Diagnostic plots for depth consistency: overall vs library-weighted
# mean (p1), and distributions/joint plot of the two coefficients of
# variation (p2-p4); dashed blue lines mark the 99th percentile.
p1 <- ggplot(depth_comp, aes(x = MEAN_DEPTH, y = MEAN)) +
geom_point(shape = 1) +
labs(x = "mean depth per locus", y = "mean weighted by lib") +
geom_abline(slope = 1, linetype = "dashed", color = "darkred", size = 1) +
theme_standard
p2 <- ggplot(depth_comp, aes(x = COEFF_VAR_IND)) +
geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
geom_vline(aes(xintercept = quantile(COEFF_VAR_IND, .99)),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "coeff var depth across all indv") +
theme_standard
p3 <- ggplot(depth_comp, aes(x = COEFF_VAR_IND, y = COEFF_VAR_LIB)) +
geom_point(shape = 1) +
scale_x_continuous(limits = c(0, 400)) +
scale_y_continuous(limits = c(0, 200)) +
labs(x = "coeff var depth across all indv", y = "coeff var mean depth betw lib") +
theme_standard
p4 <- ggplot(depth_comp, aes(x = COEFF_VAR_LIB)) +
geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
geom_vline(aes(xintercept = quantile(COEFF_VAR_LIB, .99)),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "coeff var depth between lib") +
theme_standard
m3a <- multiplot(p1, p2, p3, p4, cols=2)If loci have consistent coverage across loci the mean read depth per locus across all individuals and weighted by library should fall on the red diagonal.
Flag loci that have high variation in mean coverage between individuals and between libraries.
# identify loci with high variation in coverage: contigs whose
# between-library CV exceeds 110% or across-individual CV exceeds 200%,
# then expand to every SNP on those contigs.
contigs_var <- depth_comp %>%
filter(COEFF_VAR_LIB > 110 | COEFF_VAR_IND > 200)
SNPs_var <- filter(loc_depth, CHROM %in% contigs_var$CHROM) %>%
distinct(CHROM, POS)
# Write contig/position to text file, use file with vcftools to remove positions from dataset
write.table(SNPs_var, file = "data/VCF/LQ_F3.loci",
col.names= FALSE, row.names = FALSE, quote = FALSE)Remove loci flagged for high variance in depth across all individuals and between libraries (removes library effects), filter loci with mean depth < 15 and genotype call rate < 75%
# Build the F3a set: drop flagged high-variance loci, require mean depth
# >= 15 and call rate >= 75%, then regenerate depth stats.
vcftools --vcf data/VCF/temp/SFL.F2.recode.vcf --out data/VCF/temp/SFL.F3a --exclude-positions data/VCF/LQ_F3.loci --min-meanDP 15 --max-missing 0.75 --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/SFL.F3a.recode.vcf --out data/VCF/SFL.F3a --depth
vcftools --vcf data/VCF/temp/SFL.F3a.recode.vcf --out data/VCF/SFL.F3a --geno-depthCompare mean depth and number of sites called per individual.
# Scatter of per-individual mean depth vs number of called sites;
# dashed lines mark 10x (red) and 20x (blue) reference depths.
read_table2("data/VCF/SFL.F3a.idepth") %>%
ggplot(aes(x = N_SITES, y = MEAN_DEPTH)) +
geom_point(shape = 1, size = 1.5) +
geom_hline(yintercept = 10, linetype = "dashed", color = "darkred", size = 0.75) +
geom_hline(yintercept = 20, linetype = "dashed", color = "darkblue", size = 0.75) +
labs(x = "number of sites", y = "mean read depth") +
theme_standardUse genotype depth file to identify individuals with high variance in read depth across loci.
# number of individuals + 1 (the extra column is CHROM after select(-POS))
n <- nrow(ind_stats_F2)+1
# Read per-genotype depth (vcftools --geno-depth), keep one site per
# contig, reshape to long format, and recode very low depths as missing.
# After select(-POS) the columns are CHROM followed by one column per
# individual, i.e. the individuals occupy columns 2:n. FIX: the previous
# gather(..., 3:n) silently dropped the first individual; the analogous
# F12 block later in this file correctly gathers 2:n.
gdepth <- read_table2("data/VCF/SFL.F3a.gdepth") %>%
select(-POS) %>%
distinct(CHROM, .keep_all = TRUE) %>%
gather(key = INDV, value = DEPTH, 2:n) %>%
# -1 marks missing genotypes in vcftools output; depths of 1-2 reads are
# recoded to 0 (missing) as well, using word-boundary regexes so that
# e.g. 12 or 21 are untouched. Zeroing depths 3-4 is left disabled.
mutate(DEPTH = as.numeric(gsub(-1, 0, DEPTH)),
DEPTH = as.numeric(gsub("\\<1\\>", 0, DEPTH)),
DEPTH = as.numeric(gsub("\\<2\\>", 0, DEPTH)))
# DEPTH = as.numeric(gsub("\\<3\\>", 0, DEPTH)),
# DEPTH = as.numeric(gsub("\\<4\\>", 0, DEPTH)))
# Per-individual summary statistics of genotype depth; COEFF_VAR and
# the mean/median ratio flag individuals with uneven coverage.
idepth <- gdepth %>%
group_by(INDV) %>%
summarize(TOTAL = sum(DEPTH),
MAX = max(DEPTH),
MEAN_NON0 = mean(DEPTH[DEPTH > 0]),
MEAN = mean(DEPTH),
MEDIAN = median(DEPTH),
VAR = var(DEPTH),
STD = sd(DEPTH)) %>%
mutate(COEFF_VAR = STD/MEAN,
RATIO_MEAN_MEDIAN = MEAN/MEDIAN)
idepth <- left_join(idepth, ind_stats_F2)Compare variability of depth within individuals.
# Diagnostic plots of within-individual depth variability (p1-p10),
# combined with multiplot; dashed red lines mark the cutoffs used to
# flag low-quality individuals below (MEAN <= 10, MEDIAN <= 5).
p1 <- ggplot(idepth, aes(x = MAX, y = TOTAL)) +
geom_smooth(method = "auto",linetype = "dashed", size = 1, color = "darkred") +
geom_point(shape = 1) +
theme_standard
p2 <- ggplot(idepth, aes(x = TOTAL, y = VAR)) +
geom_smooth(method = "auto",linetype = "dashed", size = 1, color = "darkred") +
geom_point(shape = 1) +
theme_standard
p3 <- ggplot(idepth, aes(MAX, MEDIAN)) +
geom_smooth(method = "auto",linetype = "dashed", size = 1, color = "darkred") +
geom_point(shape = 1) +
theme_standard
p4 <- ggplot(idepth, aes(x = MEAN, y = MEDIAN)) +
geom_smooth(method = "auto",linetype = "dashed", size = 1, color = "darkred") +
geom_point(shape = 1) +
theme_standard
p5 <- ggplot(idepth, aes(x = COEFF_VAR, y = RATIO_MEAN_MEDIAN)) +
geom_smooth(method = "auto",linetype = "dashed", size = 1, color = "darkred") +
geom_point(shape = 1) +
theme_standard
p6 <- ggplot(idepth, aes(MEAN)) +
geom_histogram(binwidth = 5, color = "black", fill = "darkorange") +
geom_vline(xintercept = 15, color = "darkred", linetype = "dashed") +
theme_standard
p7 <- ggplot(idepth, aes(MEDIAN)) +
geom_histogram(binwidth = 5, color = "black", fill = "darkorange") +
geom_vline(xintercept = 10, color = "darkred", linetype = "dashed") +
theme_standard
p8 <- ggplot(idepth, aes(RATIO_MEAN_MEDIAN)) +
geom_histogram(binwidth = 0.1, color = "black", fill = "darkorange") +
theme_standard
p9 <- ggplot(idepth, aes(COEFF_VAR)) +
geom_histogram(binwidth = 0.025, color = "black", fill = "darkorange") +
theme_standard
p10 <- ggplot(idepth, aes(x = COEFF_VAR, y = MISS_SFL.F2)) +
geom_point(shape = 1) +
geom_hline(yintercept = 0.25, linetype = "dashed", color = "darkred", size = 0.75) +
theme_standard
m3b <- multiplot(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, cols=2)Flag LQ individuals.
# Flag individuals with low genotype depth (mean <= 10 or median <= 5);
# an alternative missing-sites criterion is kept commented out.
LQ_depth <- idepth %>%
filter(MEAN <= 10 | MEDIAN <= 5) %>%
select(INDV)
# LQ_miss <- read_table2("data/VCF/SFL.F3a.idepth") %>%
# filter(N_SITES < 112000) %>%
# select(INDV)
#
# LQ_ind <- bind_rows(LQ_depth, LQ_miss)
LQ_depth
# NOTE(review): col.names = TRUE writes an "INDV" header line into the
# file consumed by vcftools --remove — confirm this is tolerated.
write.table(LQ_depth, "data/VCF/F3_LQ.indv",
col.names = TRUE, row.names = FALSE, quote = FALSE)Remove flagged loci and individuals.
# Build the F3 set by removing flagged individuals, then regenerate the
# per-individual/per-site stats for the next QC round.
vcftools --vcf data/VCF/temp/SFL.F3a.recode.vcf --out data/VCF/temp/SFL.F3 --remove data/VCF/F3_LQ.indv --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/SFL.F3.recode.vcf --out data/VCF/SFL.F3 --depth
vcftools --vcf data/VCF/temp/SFL.F3.recode.vcf --out data/VCF/SFL.F3 --site-mean-depth
vcftools --vcf data/VCF/temp/SFL.F3.recode.vcf --out data/VCF/SFL.F3 --missing-indv
vcftools --vcf data/VCF/temp/SFL.F3.recode.vcf --out data/VCF/SFL.F3 --missing-site
vcftools --vcf data/VCF/temp/SFL.F3.recode.vcf --out data/VCF/SFL.F3 --hetCompare stats post-filtering:
# Post-F3 QC: load collated stats and draw the standard six-panel
# missingness/depth diagnostics (red dashed = observed mean, blue
# dashed = reference threshold).
# load stats files ----
ind_stats_F3 <- read.ind.stats(dir = "data/VCF", vcf = "SFL.F3") %>%
separate(INDV, into = c("SP", "LIB", "SAMPLE_ID"), sep = "_", remove = FALSE)
loc_stats_F3 <- read.loc.stats(dir = "data/VCF/", vcf = "SFL.F3")
# plot missing data per indv ----
p1 <- ggplot(ind_stats_F3, aes(x = MISS_SFL.F3)) +
geom_histogram(binwidth = .01, color = "black", fill = "darkorange") +
geom_vline(aes(xintercept = mean(MISS_SFL.F3, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = 0.5),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "missing data per indv") +
theme_standard
# plot read depth per indv ----
p2 <- ggplot(ind_stats_F3, aes(x = MEAN_DEPTH_SFL.F3)) +
geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
geom_vline(aes(xintercept = mean(MEAN_DEPTH_SFL.F3, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = 20),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "mean read depth per indv") +
theme_standard
# plot depth vs missing ----
p3 <- ggplot(ind_stats_F3, aes(x = MEAN_DEPTH_SFL.F3, y = MISS_SFL.F3)) +
geom_point() +
geom_vline(aes(xintercept = mean(MEAN_DEPTH_SFL.F3, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = 20),
color = "darkblue", linetype = "dashed", size = 1) +
geom_hline(aes(yintercept = mean(MISS_SFL.F3, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_hline(aes(yintercept = 0.5),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "mean depth per indv", y = "% missing data") +
theme_standard
# plot distribution missing data per locus ----
p4 <- ggplot(loc_stats_F3, aes(x = MISS_SFL.F3)) +
geom_histogram(binwidth = 0.01, color = "black", fill = "darkorange") +
geom_vline(aes(xintercept = mean(MISS_SFL.F3, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = 0.1),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "% missing data per locus") +
theme_standard
# plot distribution mean read depth ----
p5 <- ggplot(loc_stats_F3, aes(x = MEAN_DEPTH_SFL.F3)) +
geom_histogram(binwidth = 5, color = "black", fill = "darkorange") +
geom_vline(aes(xintercept = mean(MEAN_DEPTH_SFL.F3, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = 20),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "mean read depth per locus") +
theme_standard
# plot read depth vs missing data ----
p6 <- ggplot(loc_stats_F3, aes(x = MEAN_DEPTH_SFL.F3, y = MISS_SFL.F3)) +
geom_point() +
geom_vline(aes(xintercept = mean(MEAN_DEPTH_SFL.F3, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = 20),
color = "darkblue", linetype = "dashed", size = 1) +
geom_hline(aes(yintercept = mean(MISS_SFL.F3, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_hline(aes(yintercept = 0.1),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "mean depth per locus", y = "% missing data") +
theme_standard
m3 <- multiplot(p1, p2, p3, p4, p5, p6, cols=2)AB: Allele balance at heterozygous sites: a number between 0 and 1 representing the ratio of reads showing the reference allele to all reads, considering only reads from individuals called as heterozygous
Allele balance is the ratio of reads for the reference allele to all reads, considering only reads from individuals called as heterozygous. Values range from 0 - 1; allele balance (for real loci) should be approx. 0.5. Filter contigs with SNPs for which the allele balance is < 0.25 or > 0.75.
# Histogram of allele balance at heterozygous sites; red dashed line at
# the expected 0.5, blue dashed lines at the 0.25/0.75 filter bounds.
read.table("data/VCF/temp/SFL.F4.AB",
col.names = "AB", stringsAsFactors = FALSE) %>%
ggplot(aes(x = AB)) +
geom_histogram(binwidth = 0.01, color = "black", fill = "darkorange") +
geom_vline(xintercept = 0.5, color = "red", linetype = "dashed") +
geom_vline(xintercept = 0.25, color = "darkblue", linetype = "dashed") +
geom_vline(xintercept = 0.75, color = "darkblue", linetype = "dashed") +
theme_standard
Filter contigs with SNP calls with AB < 0.25 or AB > 0.75; retain loci very close to 0 (retain loci that are fixed variants). Remove genotypes if the quality sum of the reference or alternate allele was 0.
Remaining SNPs: 104,387
# Extract per-site depth (INFO DP) and site quality from the F4 VCF.
# NOTE(review): output is named SFL.4.DEPTH (not SFL.F4.DEPTH) — the R
# code below reads SFL.4.DEPTH, so the names are consistent, but verify
# this spelling was intended.
# site depth
cut -f8 SFL.F4.vcf | grep -oe "DP=[0-9]*" | sed -s 's/DP=//g' > SFL.4.DEPTH
# quality score
mawk '!/#/' SFL.F4.vcf | cut -f1,2,6 > SFL.F4.loci.qualCompare quality/depth ratio.
# Compare site quality to site depth; the quality/depth ratio flags
# sites whose quality is low relative to coverage (cutoff 0.2, dashed).
# depth
depth <- read.table("data/VCF/temp/SFL.4.DEPTH",
col.names = "depth")
# quality score
qual <- read.table("data/VCF/temp/SFL.F4.loci.qual",
col.names = c("locus", "pos", "qual"))
# bind_cols assumes both files list sites in the same order - verify
temp <- bind_cols(qual, depth) %>%
mutate(ratio = qual/depth)
ggplot(temp, aes(x = ratio)) +
geom_histogram(binwidth = 0.25, color = "black", fill = "grey85") +
geom_vline(xintercept = 0.2, color = "darkred", linetype = "dashed", size = 1) +
theme_standardRemove loci with quality/depth ratio < 0.2
# Keep sites with quality/depth ratio > 0.2 (vcflib vcffilter), then
# count remaining SNP records.
vcffilter -s -f "QUAL / DP > 0.2" SFL.F4.vcf > SFL.F5.vcf
mawk '!/#/' SFL.F5.vcf | wc -lRemaining SNPs: 71,921
Remove loci based on ratio of mapping quality for reference and alternate allele, i.e. sites that have a high discrepancy between the mapping qualities of two alleles.
# Compare mean mapping quality of alternate (MQM) vs reference (MQMR)
# alleles; sites with a strongly skewed ratio are highlighted in red.
temp <- read.table("data/VCF/temp/SFL.F5.MQM", col.names = "MQM")
mapqual <- read.table("data/VCF/temp/SFL.F5.MQMR", col.names = "MQMR")
mapqual <- bind_cols(mapqual, temp) %>%
mutate(ratio = MQM/MQMR)
# NOTE: this data frame masks dplyr::filter by name; calls to filter()
# still resolve to the function, but consider renaming.
filter <- mapqual %>%
filter(ratio < 0.25 | ratio > 1.75)
ggplot(mapqual, aes(x = MQM, y = MQMR)) +
geom_point(shape = 1, size = 1) +
geom_abline(intercept = 0, slope = 1, size = 1, color = "red", linetype = "dashed") +
geom_abline(intercept = 0, slope = 4, size = 1, color = "darkblue", linetype = "dashed") +
geom_abline(intercept = 0, slope = 0.571, size = 1, color = "darkblue", linetype = "dashed") +
geom_point(data = filter, aes(x = MQM, y = MQMR), shape = 21, color = "black", fill = "red") +
scale_x_continuous(limits = c(0, 65)) +
scale_y_continuous(limits = c(0, 65)) +
theme_standardFilter loci with mapping quality ratio < 0.25 and > 1.75.
# Keep sites whose alt/ref mapping-quality ratio lies in (0.25, 1.75),
# then count remaining SNP records.
vcffilter -s -f "MQM / MQMR > 0.25 & MQM / MQMR < 1.75" SFL.F5.vcf > SFL.F6.vcf
mawk '!/#/' SFL.F6.vcf | wc -lRemaining SNPs: 71,865.
SRF: Number of reference observations on the forward strand SAF: Number of alternate observations on the forward strand SRR: Number of reference observations on the reverse strand SAR: Number of alternate observations on the reverse strand
Paired end reads should not overlap, and a SNP site should only be covered by either the forward or reverse read (strand).
# Per-site strand-support counts extracted from the F6 VCF:
# SAF/SAR = alternate-allele reads on forward/reverse strand,
# SRF/SRR = reference-allele reads on forward/reverse strand.
SAF <- read.table("data/VCF/temp/SFL.F6.SAF", col.names = "SAF")
SAR <- read.table("data/VCF/temp/SFL.F6.SAR", col.names = "SAR")
SRF <- read.table("data/VCF/temp/SFL.F6.SRF", col.names = "SRF")
SRR <- read.table("data/VCF/temp/SFL.F6.SRR", col.names = "SRR")
# Combine into one table and compute forward/reverse ratios for the
# alternate (ratioA) and reference (ratioR) alleles.
strands <- bind_cols(SAF, SAR, SRF, SRR) %>%
  mutate(ratioA = SAF/SAR, ratioR = SRF/SRR)
# Scatter of forward vs reverse alternate-allele support; dashed blue
# lines mark the 100:1 strand-bias bounds used in the filter below.
ggplot(strands, aes(x = SAF, y = SAR)) +
geom_point(shape = 1) +
geom_abline(intercept = 0, slope = 0.1, color = "blue", linetype = "dashed", size = 1) +
geom_abline(intercept = 0, slope = 100, color = "blue", linetype = "dashed", size = 1) +
theme_standardRemove SNP sites that have > 100x more forward alternate reads than reverse alternate reads and > 100x more forward reverse reads than reverse alternate reads.
# Remove extreme strand-bias sites (both alleles > 100x skewed toward
# one strand in either direction), then count remaining SNP records.
vcffilter -f "SAF / SAR > 100 & SRF / SRR > 100 | SAR / SAF > 100 & SRR / SRF > 100" -s SFL.F6.vcf > SFL.F7.vcf
mawk '!/#/' SFL.F7.vcf | wc -lNumber of SNPs remaining: 67,441.
PAIRED: Proportion of observed alternate alleles which are supported by properly paired read fragments PAIREDR: Proportion of observed reference alleles which are supported by properly paired read fragments
Identify loci with only unpaired reads mapping to them - an artifact introduced due de novo reference assembly. Compare number of paired reads mapping the reference and the alternate alleles to identify discrepancy in the paired status for reads supporting reference and alternate alleles.
Number of SNPs in data set: 67,168.
Identify distribution of depth (based on original data set) to identify loci with excess coverage.
Original number of individuals in data set is 508 (INFO flags in filtered data set are based on original number of individuals in data set).
Create file with the original site depth and quality score for each site:
Calculate average depth and standard deviation:
Mean depth per locus (across all individuals) is 4.4975407 × 10^4 and the standard deviation is 2.4665984 × 10^4.
Filter SNP sites with depth > mean depth + 1 standard deviation = 9.4307374 × 10^4 that also have quality scores < 2× the depth at that site, and output depth per site.
Compare the distribution of mean depth per site averaged across individuals to determine cut-off value of sites with excessively high depth indicative of paralogs/multicopy loci.
# calculate mean depth per site (177 individuals)
# Histogram of mean depth per site with the overall mean (red) and the
# 95th/99th percentiles (blue) marked, to pick the --max-meanDP cutoff
# that removes likely paralogs/multicopy loci.
read.table("data/VCF/SFL.F9a.ldepth.mean",
header = TRUE, stringsAsFactors = FALSE) %>%
ggplot(aes(x = MEAN_DEPTH)) +
geom_histogram(binwidth = 5, color = "black", fill = "darkorange") +
geom_vline(aes(xintercept = mean(MEAN_DEPTH, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = quantile(MEAN_DEPTH, .95)),
color = "darkblue", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = quantile(MEAN_DEPTH, .99)),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "mean depth per site") +
theme_standardChoose cut-off for maximum mean read depth = 220.
# Build the F9 set: cap mean depth at 220x and drop the LQ_F9 positions,
# then regenerate per-individual/per-site stats.
vcftools --vcf data/VCF/temp/SFL.F8.vcf --out data/VCF/temp/SFL.F9 --max-meanDP 220 --exclude-positions data/VCF/temp/LQ_F9.loci --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/SFL.F9.recode.vcf --out data/VCF/SFL.F9 --depth
vcftools --vcf data/VCF/temp/SFL.F9.recode.vcf --out data/VCF/SFL.F9 --site-mean-depth
vcftools --vcf data/VCF/temp/SFL.F9.recode.vcf --out data/VCF/SFL.F9 --missing-indv
vcftools --vcf data/VCF/temp/SFL.F9.recode.vcf --out data/VCF/SFL.F9 --missing-site
vcftools --vcf data/VCF/temp/SFL.F9.recode.vcf --out data/VCF/SFL.F9 --hetAnalyze stats post-filtering:
# load stats files ----
# Collate vcftools stats for the F9 set (project helpers from
# scr/VCFfilterstats.R); INDV assumed formatted SP_LIB_SAMPLE_ID.
ind_stats_F9 <- read.ind.stats(dir = "data/VCF", vcf = "SFL.F9") %>%
separate(INDV, into = c("SP", "LIB", "SAMPLE_ID"), sep = "_", remove = FALSE)
loc_stats_F9 <- read.loc.stats(dir = "data/VCF/", vcf = "SFL.F9")Data set contains 65897 SNP sites and 361 individuals.
Remove Indels from data set.
# Decompose complex variants into SNPs/indels (vcfallelicprimitives),
# remove indels, and regenerate stats for the SNP-only F10 set.
vcfallelicprimitives data/VCF/temp/SFL.F9.recode.vcf --keep-info --keep-geno > data/VCF/temp/SFL.prim.vcf
vcftools --vcf data/VCF/temp/SFL.prim.vcf --out data/VCF/temp/SFL.F10 --remove-indels --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/SFL.F10.recode.vcf --out data/VCF/SFL.F10 --depth
vcftools --vcf data/VCF/temp/SFL.F10.recode.vcf --out data/VCF/SFL.F10 --site-mean-depth
vcftools --vcf data/VCF/temp/SFL.F10.recode.vcf --out data/VCF/SFL.F10 --missing-indv
vcftools --vcf data/VCF/temp/SFL.F10.recode.vcf --out data/VCF/SFL.F10 --missing-site
vcftools --vcf data/VCF/temp/SFL.F10.recode.vcf --out data/VCF/SFL.F10 --hetAnalyze stats post-filtering:
# Post-F10 QC: same six-panel missingness/depth diagnostics as for F3
# (red dashed = observed mean, blue dashed = reference threshold).
# load stats files ----
ind_stats_F10 <- read.ind.stats(dir = "data/VCF", vcf = "SFL.F10") %>%
separate(INDV, into = c("SP", "LIB", "SAMPLE_ID"), sep = "_", remove = FALSE)
loc_stats_F10 <- read.loc.stats(dir = "data/VCF/", vcf = "SFL.F10")
# plot missing data per indv ----
p1 <- ggplot(ind_stats_F10, aes(x = MISS_SFL.F10)) +
geom_histogram(binwidth = .01, color = "black", fill = "darkorange") +
geom_vline(aes(xintercept = mean(MISS_SFL.F10, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = 0.5),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "missing data per indv") +
theme_standard
# plot read depth per indv ----
p2 <- ggplot(ind_stats_F10, aes(x = MEAN_DEPTH_SFL.F10)) +
geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
geom_vline(aes(xintercept = mean(MEAN_DEPTH_SFL.F10, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = 20),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "mean read depth per indv") +
theme_standard
# plot depth vs missing ----
p3 <- ggplot(ind_stats_F10, aes(x = MEAN_DEPTH_SFL.F10, y = MISS_SFL.F10)) +
geom_point() +
geom_vline(aes(xintercept = mean(MEAN_DEPTH_SFL.F10, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = 20),
color = "darkblue", linetype = "dashed", size = 1) +
geom_hline(aes(yintercept = mean(MISS_SFL.F10, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_hline(aes(yintercept = 0.5),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "mean depth per indv", y = "% missing data") +
theme_standard
# plot distribution missing data per locus ----
p4 <- ggplot(loc_stats_F10, aes(x = MISS_SFL.F10)) +
geom_histogram(binwidth = 0.01, color = "black", fill = "darkorange") +
geom_vline(aes(xintercept = mean(MISS_SFL.F10, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = 0.1),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "% missing data per locus") +
theme_standard
# plot distribution mean read depth ----
p5 <- ggplot(loc_stats_F10, aes(x = MEAN_DEPTH_SFL.F10)) +
geom_histogram(binwidth = 5, color = "black", fill = "darkorange") +
geom_vline(aes(xintercept = mean(MEAN_DEPTH_SFL.F10, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = 20),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "mean read depth per locus") +
theme_standard
# plot read depth vs missing data ----
p6 <- ggplot(loc_stats_F10, aes(x = MEAN_DEPTH_SFL.F10, y = MISS_SFL.F10)) +
geom_point() +
geom_vline(aes(xintercept = mean(MEAN_DEPTH_SFL.F10, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = 20),
color = "darkblue", linetype = "dashed", size = 1) +
geom_hline(aes(yintercept = mean(MISS_SFL.F10, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_hline(aes(yintercept = 0.1),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "mean depth per locus", y = "% missing data") +
theme_standard
m3 <- multiplot(p1, p2, p3, p4, p5, p6, cols=2)Data set contains 70608 SNP sites and 361 individuals.
Assess individuals in terms of low coverage, high missing data, and frequency burden (i.e. distribution of the number of variants within each individual of a specific frequency).
# Build the F11 set (mean depth >= 15, call rate >= 90%), regenerate
# stats, and run the HWE test (--hardy) used in the next step.
vcftools --vcf data/VCF/temp/SFL.F10.recode.vcf --out data/VCF/temp/SFL.F11 --min-meanDP 15 --max-missing 0.9 --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/SFL.F11.recode.vcf --out data/VCF/SFL.F11 --depth
vcftools --vcf data/VCF/temp/SFL.F11.recode.vcf --out data/VCF/SFL.F11 --site-mean-depth
vcftools --vcf data/VCF/temp/SFL.F11.recode.vcf --out data/VCF/SFL.F11 --missing-indv
vcftools --vcf data/VCF/temp/SFL.F11.recode.vcf --out data/VCF/SFL.F11 --missing-site
vcftools --vcf data/VCF/temp/SFL.F11.recode.vcf --out data/VCF/SFL.F11 --het
vcftools --vcf data/VCF/temp/SFL.F11.recode.vcf --out data/VCF/SFL.F11 --hardyIdentify SNPs with more than 0.5 heterozygosity and significant.
# calculate observed heterozygosity
# Parse vcftools --hardy output: split the OBS(HOM1/HET/HOM2) and
# E(HOM1/HET/HOM2) columns into counts and derive Ho/He per site.
hwe <- read.table("data/VCF/SFL.F11.hwe",
stringsAsFactors = FALSE, header = TRUE) %>%
select(-ChiSq_HWE) %>%
separate(`OBS.HOM1.HET.HOM2.`,
into = c("obs_hom1", "obs_het", "obs_hom2"),
sep = "/", convert = TRUE) %>%
separate(`E.HOM1.HET.HOM2.`,
into = c("exp_hom1", "exp_het", "exp_hom2"),
sep = "/", convert = TRUE) %>%
mutate(Ho = obs_het/(obs_hom1 + obs_hom2 + obs_het),
He = exp_het/(exp_hom1 + exp_hom2 + exp_het)) %>%
select(CHR, POS, obs_hom1, exp_hom1, obs_hom2, exp_hom2, P_HET_DEFICIT, obs_het, exp_het, P_HET_EXCESS, Ho, He, P_HWE)Distribution of observed heterozygosity.
# Histogram of observed heterozygosity; dashed blue line marks the 0.5
# cutoff used to flag candidate paralogs below.
ggplot(hwe, aes(x = Ho)) +
geom_histogram(binwidth = 0.05, color = "black", fill = "darkorange") +
geom_vline(xintercept = 0.5, color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "observed heterozygosity") +
theme_standardIdentify loci with heterozygosity > 0.5, then correct p-values for multiple comparisons using Benjamini-Hochberg method.
# Identify SNPs with observed heterozygosity > 0.5 whose heterozygote
# excess remains significant after FDR (Benjamini-Hochberg) correction.
# FIX: `method = "fdr"` must be an argument of p.adjust(); previously it
# sat outside the call, creating a constant `method` column while
# p.adjust() silently used its default ("holm") correction.
excess_hwe <- hwe %>%
filter(Ho > 0.5) %>%
mutate(p_adj = p.adjust(P_HET_EXCESS, method = "fdr")) %>%
filter(p_adj < 0.05)
# Expand flagged SNPs to whole contigs: every SNP on any contig carrying
# at least one excess-heterozygosity SNP is written out for exclusion.
# NOTE(review): prose below says "more than one SNP" but the code flags
# contigs with one or more — confirm which was intended.
contigs <- hwe %>%
filter(CHR %in% excess_hwe$CHR) %>%
select(CHR) %>%
unique()
# identify all SNPs on contigs
SNPs <- hwe %>%
filter(CHR %in% contigs$CHR) %>%
select(CHR, POS)
# Write contig/position to text file, use file with vcftools to remove positions from dataset
write.table(SNPs, file = "data/VCF/hetexcess.loci",
col.names= FALSE, row.names = FALSE, quote = FALSE)Remove SNPs with excess heterozygosity and contigs with more than one SNP w/excess heterozygosity.
# Build the F12 set by excluding heterozygosity-excess loci, then
# regenerate the full suite of stats including genotype depth.
vcftools --vcf data/VCF/temp/SFL.F11.recode.vcf --out data/VCF/temp/SFL.F12 --exclude-positions data/VCF/hetexcess.loci --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/SFL.F12.recode.vcf --out data/VCF/SFL.F12 --depth
vcftools --vcf data/VCF/temp/SFL.F12.recode.vcf --out data/VCF/SFL.F12 --site-mean-depth
vcftools --vcf data/VCF/temp/SFL.F12.recode.vcf --out data/VCF/SFL.F12 --missing-indv
vcftools --vcf data/VCF/temp/SFL.F12.recode.vcf --out data/VCF/SFL.F12 --missing-site
vcftools --vcf data/VCF/temp/SFL.F12.recode.vcf --out data/VCF/SFL.F12 --het
vcftools --vcf data/VCF/temp/SFL.F12.recode.vcf --out data/VCF/SFL.F12 --geno-depthVisualize stats:
# load stats files ----
# Per-individual stats for stage F12; INDV encodes species, library and
# sample ID separated by "_", split into columns for grouping later.
ind_stats_F12 <- read.ind.stats(dir = "data/VCF", vcf = "SFL.F12") %>%
separate(INDV, into = c("SP", "LIB", "SAMPLE_ID"), sep = "_", remove = FALSE)
# per-locus stats for the same filter stage
loc_stats_F12 <- read.loc.stats(dir = "data/VCF/", vcf = "SFL.F12")Data set contains 56948 SNP sites and 361 individuals.
Compare mean depth and number of sites called per individual.
read_table2("data/VCF/SFL.F12.idepth") %>%
ggplot(aes(x = N_SITES, y = MEAN_DEPTH)) +
geom_point(shape = 1, size = 1) +
labs(x = "number of sites", y = "mean read depth") +
theme_standardUse genotype depth file to identify individuals with high variance in read depth across loci.
# number of individuals
# n = index of the last genotype column in the .gdepth table
# (column 1 is CHROM after dropping POS; columns 2..n are individuals)
n <- nrow(ind_stats_F12)+1
# read genotype depth & code values < 5 as missing, retain only one locus per contig
# NOTE(review): only depths of exactly 1 and 2 are zeroed below (3 and 4 are
# commented out), so the effective cutoff is depth < 3, not < 5 — confirm.
gdepth <- read_table2("data/VCF/SFL.F12.gdepth") %>%
select(-POS) %>%
distinct(CHROM, .keep_all = TRUE) %>%
gather(key = INDV, value = GDEPTH, 2:n) %>%
# vcftools writes -1 for missing genotypes; recode those as depth 0
mutate(GDEPTH = as.numeric(gsub(-1, 0, GDEPTH))) %>%
mutate(DEPTH = GDEPTH) %>%
# \\< and \\> are word boundaries: replace whole-token depths 1 and 2 with 0
mutate(DEPTH = as.numeric(gsub("\\<1\\>", 0, DEPTH))) %>%
mutate(DEPTH = as.numeric(gsub("\\<2\\>", 0, DEPTH))) %>%
# mutate(DEPTH = as.numeric(gsub("\\<3\\>", 0, DEPTH))) %>%
# mutate(DEPTH = as.numeric(gsub("\\<4\\>", 0, DEPTH))) %>%
mutate(MISSING = ifelse(DEPTH == 0, "missing", "genotyped")) %>%
# bin the raw genotype depth for the tile plots
# NOTE(review): GDEPTH == 0 falls outside the lowest break (0, 3] and is
# binned as NA rather than "<3" — confirm this is intended.
mutate(GDEPTHBIN = cut(GDEPTH,
breaks = c(0, 3, 10, 25, 50, 100, 500, max(GDEPTH)),
labels = c("<3", "3-10", "10-25", "25-50", "50-100", "100-500", ">500"))) %>%
# harmonize library plate codes (e.g. 4A*/4B* -> SFL4) for faceting
separate(INDV, into = c("POP", "LIB", "temp"), sep = "_", remove = FALSE) %>%
mutate(LIB = str_replace_all(LIB, "4A.*", "SFL4"),
LIB = str_replace_all(LIB, "4B.*", "SFL4"),
LIB = str_replace_all(LIB, "5A.*", "SFL5"),
LIB = str_replace_all(LIB, "5B.*", "SFL5"),
LIB = str_replace_all(LIB, "6A.*", "SFL6"),
LIB = str_replace_all(LIB, "6B.*", "SFL6")) %>%
select(-temp)Compare distribution of depth.
ggplot(gdepth, aes(x = INDV, y = CHROM)) +
geom_tile(aes(fill = GDEPTHBIN)) +
scale_fill_viridis(direction = -1, option = "viridis",
name = 'genotype depth',
discrete = TRUE) +
facet_grid(. ~ POP, space = "free", scales = "free", drop = TRUE) +
labs(x = "individual", y = "locus") +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom")Compare distribution of depth of individuals grouped by library.
ggplot(gdepth, aes(x = INDV, y = CHROM)) +
geom_tile(aes(fill = GDEPTHBIN)) +
scale_fill_viridis(direction = -1, option = "viridis",
name = 'genotype depth',
discrete = TRUE) +
facet_grid(. ~ LIB, space = "free", scales = "free", drop = TRUE) +
labs(x = "individual", y = "locus") +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom")Compare distribution of genotype depths (across all individuals).
temp <- count(gdepth, GDEPTHBIN)
ggplot(temp, aes(x = GDEPTHBIN, y = n)) +
geom_bar(stat= "identity", color = "black", fill = "darkorange") +
theme_standardIdentify variance in depth w/in individuals.
# calculate summary statistics
# Per-individual depth summaries over the recoded DEPTH values: totals and
# spread (VAR/STD), plus two dispersion diagnostics — coefficient of
# variation and the mean/median ratio (a right-skew indicator).
idepth <- gdepth %>%
group_by(INDV) %>%
summarize(TOTAL = sum(DEPTH),
MAX = max(DEPTH),
MEAN = mean(DEPTH),
MEDIAN = median(DEPTH),
VAR = var(DEPTH),
STD = sd(DEPTH)) %>%
mutate(COEFF_VAR = STD/MEAN,
RATIO_MEAN_MEDIAN = MEAN/MEDIAN)
p1 <- ggplot(idepth, aes(x = MEAN, y = MEDIAN)) +
geom_smooth(method = "auto",linetype = "dashed", size = 1, color = "darkred") +
geom_point(shape = 1) +
theme_standard
p2 <- ggplot(idepth, aes(x = MAX, y = TOTAL)) +
geom_smooth(method = "lm",linetype = "dashed", size = 1, color = "darkred") +
geom_point(shape = 1) +
theme_standard
p3 <- ggplot(idepth, aes(x = TOTAL, y = VAR)) +
geom_smooth(method = "auto",linetype = "dashed", size = 1, color = "darkred") +
geom_point(shape = 1) +
theme_standard
p4 <- ggplot(idepth, aes(MEDIAN)) +
geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
theme_standard
p5 <- ggplot(idepth, aes(RATIO_MEAN_MEDIAN)) +
geom_histogram(binwidth = 0.1, color = "black", fill = "darkorange") +
theme_standard
p6 <- ggplot(idepth, aes(COEFF_VAR)) +
geom_histogram(binwidth = 0.05, color = "black", fill = "darkorange") +
theme_standard
m14b <- multiplot(p1, p2, p3, p4, p5, p6, cols=2)Compare depth, missing data and individual heterozygosity levels.
imiss <- read_table2("data/VCF/SFL.F12.imiss") %>%
select(INDV, N_DATA, F_MISS)
istats <- left_join(idepth, imiss)
Fis <- read_table2("data/VCF/SFL.F12.het") %>%
select(-N_SITES)
istats <- left_join(istats, Fis)
p1 <- ggplot(istats, aes(x = MEAN, y = F_MISS, fill = `F`)) +
geom_point(shape = 21, size = 2, color = "black") +
scale_fill_viridis(direction = 1, option = "viridis",
name = 'Fis', discrete = FALSE) +
theme_standard
p2 <- ggplot(istats, aes(x = COEFF_VAR, y = MEAN, fill = F_MISS)) +
geom_point(shape = 21, size = 2, color = "black") +
scale_fill_viridis(direction = -1, option = "viridis",
name = '% missing', discrete = FALSE) +
theme_standard
# Observed homozygosity vs mean depth, colored by missingness.
# FIX: the legend previously read 'mean read depth' although fill = F_MISS;
# the label now matches the mapped variable, consistent with sibling panels.
p3 <- ggplot(istats, aes(x = `O(HOM)`, y = MEAN, fill = F_MISS)) +
geom_point(shape = 21, size = 2, color = "black") +
scale_fill_viridis(direction = -1, option = "viridis",
name = '% missing', discrete = FALSE) +
theme_standard
p4 <- ggplot(istats, aes(x = MEAN)) +
geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
theme_standard
p5 <- ggplot(istats, aes(x = `F`, y = MEAN, fill = F_MISS)) +
geom_point(shape = 21, size = 2, color = "black") +
scale_fill_viridis(direction = -1, option = "viridis",
name = '% missing', discrete = FALSE) +
theme_standard
p6 <- ggplot(istats, aes(x = `F`, y = COEFF_VAR, fill = F_MISS)) +
geom_point(shape = 21, size = 2, color = "black") +
scale_fill_viridis(direction = -1, option = "viridis",
name = '% missing', discrete = FALSE) +
theme_standard
p7 <- ggplot(istats, aes(x = `F`, y = RATIO_MEAN_MEDIAN, fill = F_MISS)) +
geom_point(shape = 21, size = 2, color = "black") +
scale_fill_viridis(direction = -1, option = "viridis",
name = '% missing', discrete = FALSE) +
scale_y_continuous(limits = c(0.8, 4)) +
theme_standard
p8 <- ggplot(istats, aes(x = F_MISS)) +
geom_histogram(binwidth = 0.01, color = "black", fill = "darkorange") +
theme_standard
m14b <- multiplot(p1, p2, p3, p4, p5, p6, p7, p8, cols=2)Flag low quality individuals
# Flag low-quality individuals by extreme inbreeding coefficient:
# strongly negative F suggests contamination, strongly positive F
# suggests poor genotyping / allele dropout.
View(istats)
LQ_ind <- istats %>%
  filter(`F` > 0.4 | `F` < -0.9) %>%
  select(INDV)
# one individual ID per line, read back by vcftools --remove
write.table(LQ_ind, "data/VCF/F13_LQ.indv",
            col.names = TRUE, row.names = FALSE, quote = FALSE)
vcftools --vcf data/VCF/temp/SFL.F12.recode.vcf --out data/VCF/temp/SFL.F13 --remove data/VCF/F13_LQ.indv --recode --recode-INFO-all
# Alabama
vcftools --vcf data/VCF/temp/SFL.F13.recode.vcf --out data/VCF/temp/AL --keep data/VCF/AL.ind --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/AL.recode.vcf --out data/VCF/AL --missing-site
# Florida Atlantic
vcftools --vcf data/VCF/temp/SFL.F13.recode.vcf --out data/VCF/temp/FLA --keep data/VCF/FLA.ind --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/FLA.recode.vcf --out data/VCF/FLA --missing-site
# Florida Gulf
vcftools --vcf data/VCF/temp/SFL.F13.recode.vcf --out data/VCF/temp/FLG --keep data/VCF/FLG.ind --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/FLG.recode.vcf --out data/VCF/FLG --missing-site
# Louisiana
vcftools --vcf data/VCF/temp/SFL.F13.recode.vcf --out data/VCF/temp/LA --keep data/VCF/LA.ind --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/LA.recode.vcf --out data/VCF/LA --missing-site
# Mississippi
vcftools --vcf data/VCF/temp/SFL.F13.recode.vcf --out data/VCF/temp/MS --keep data/VCF/MS.ind --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/MS.recode.vcf --out data/VCF/MS --missing-site
# South Carolina
vcftools --vcf data/VCF/temp/SFL.F13.recode.vcf --out data/VCF/temp/SC --keep data/VCF/SC.ind --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/SC.recode.vcf --out data/VCF/SC --missing-site
# Texas
vcftools --vcf data/VCF/temp/SFL.F13.recode.vcf --out data/VCF/temp/TX --keep data/VCF/TX.ind --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/TX.recode.vcf --out data/VCF/TX --missing-siteCompare missing data per sample region:
# Import per-locus missing-data stats (.lmiss) for every sampling region ----
# The seven copy-pasted read.table stanzas are collapsed into a single loop;
# the result is unchanged: one data frame with CHR, POS, F_MISS, POP.
# lapply over an unnamed character vector yields an unnamed list, so ldply()
# does not add an .id column (same as the original numeric-index list).
pops <- c("AL", "FLA", "FLG", "LA", "MS", "SC", "TX")
loc_missing <- lapply(pops, function(pop) {
  read.table(paste0("data/VCF/", pop, ".lmiss"),
             header = TRUE, stringsAsFactors = FALSE) %>%
    select(CHR, POS, F_MISS) %>%
    mutate(POP = pop)
})
# create data frame with all information
loc_missing <- ldply(loc_missing, data.frame)
ggplot(loc_missing, aes(x = F_MISS)) +
geom_histogram(binwidth = .025, color = "black", fill = "darkorange") +
geom_vline(aes(xintercept = 0.15),
color = "darkred", linetype = "dashed", size = 1) +
labs(x = "missing data per locus") +
facet_grid(POP ~ .) +
theme_standardidentify loci with high missing data in each sample location
# identify loci with high missing data in each region
# a locus exceeding 20% missingness in any single region is flagged
SNPs <- loc_missing %>%
  filter(F_MISS > 0.2)
# number of flagged SNPs per region
count(SNPs, POP)
contigs <- distinct(SNPs, CHR)
SNPs <- select(SNPs, CHR, POS)
# Write contig/position to text file, use file with vcftools to remove positions from dataset
write.table(SNPs, file = "data/VCF/LQ_F13.loci",
col.names= FALSE, row.names = FALSE, quote = FALSE)Remove loci from data set:
vcftools --vcf data/VCF/temp/SFL.F13.recode.vcf --out data/VCF/temp/SFL.F14 --exclude-positions data/VCF/LQ_F13.loci --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/SFL.F14.recode.vcf --out data/VCF/SFL.F14 --site-quality
vcftools --vcf data/VCF/temp/SFL.F14.recode.vcf --out data/VCF/SFL.F14 --depth
vcftools --vcf data/VCF/temp/SFL.F14.recode.vcf --out data/VCF/SFL.F14 --site-mean-depth
vcftools --vcf data/VCF/temp/SFL.F14.recode.vcf --out data/VCF/SFL.F14 --missing-indv
vcftools --vcf data/VCF/temp/SFL.F14.recode.vcf --out data/VCF/SFL.F14 --missing-site
vcftools --vcf data/VCF/temp/SFL.F14.recode.vcf --out data/VCF/SFL.F14 --het
vcftools --vcf data/VCF/temp/SFL.F14.recode.vcf --out data/VCF/SFL.F14 --geno-depth
vcftools --vcf data/VCF/temp/SFL.F14.recode.vcf --out data/VCF/SFL.F14 --singletons
vcftools --vcf data/VCF/temp/SFL.F14.recode.vcf --out data/VCF/SFL.F14 --indv-freq-burden
vcftools --vcf data/VCF/temp/SFL.F14.recode.vcf --out data/VCF/SFL.F14 --freqCompare stats:
# load stats files ----
# Per-individual and per-locus stats for stage F14 (after removing the
# high-missingness loci flagged above).
ind_stats_F14 <- read.ind.stats(dir = "data/VCF", vcf = "SFL.F14") %>%
separate(INDV, into = c("SP", "LIB", "SAMPLE_ID"), sep = "_", remove = FALSE)
loc_stats_F14 <- read.loc.stats(dir = "data/VCF", vcf = "SFL.F14")
# plot missing data per indv ----
p1 <- ggplot(ind_stats_F14, aes(x = MISS_SFL.F14)) +
geom_histogram(binwidth = .01, color = "black", fill = "darkorange") +
geom_vline(aes(xintercept = mean(MISS_SFL.F14, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = 0.5),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "missing data per indv") +
theme_standard
# plot read depth per indv ----
p2 <- ggplot(ind_stats_F14, aes(x = MEAN_DEPTH_SFL.F14)) +
geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
geom_vline(aes(xintercept = mean(MEAN_DEPTH_SFL.F14, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = 20),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "mean read depth per indv") +
theme_standard
# plot depth vs missing ----
p3 <- ggplot(ind_stats_F14, aes(x = MEAN_DEPTH_SFL.F14, y = MISS_SFL.F14)) +
geom_point() +
geom_vline(aes(xintercept = mean(MEAN_DEPTH_SFL.F14, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = 20),
color = "darkblue", linetype = "dashed", size = 1) +
geom_hline(aes(yintercept = mean(MISS_SFL.F14, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_hline(aes(yintercept = 0.5),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "mean depth per indv", y = "% missing data") +
theme_standard
# plot Fis per indv ----
p4 <- ggplot(ind_stats_F14, aes(x = Fis_SFL.F14)) +
geom_histogram(binwidth = .01, color = "black", fill = "darkorange") +
geom_vline(aes(xintercept = mean(Fis_SFL.F14, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = 0),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "Fis per indv") +
theme_standard
# plot Fis vs missing data per indv ----
p5 <- ggplot(ind_stats_F14, aes(x = Fis_SFL.F14, y = MISS_SFL.F14)) +
geom_point() +
geom_vline(aes(xintercept = mean(Fis_SFL.F14, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = 0),
color = "darkblue", linetype = "dashed", size = 1) +
geom_hline(aes(yintercept = mean(MISS_SFL.F14, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_hline(aes(yintercept = 0.5),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "Fis per indv", y = "% missing data") +
theme_standard
# plot Fis vs mean depth per indv ----
p6 <- ggplot(ind_stats_F14, aes(x = Fis_SFL.F14, y = MEAN_DEPTH_SFL.F14)) +
geom_point() +
geom_vline(aes(xintercept = mean(Fis_SFL.F14, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = 0),
color = "darkblue", linetype = "dashed", size = 1) +
geom_hline(aes(yintercept = mean(MEAN_DEPTH_SFL.F14, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_hline(aes(yintercept = 20),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "Fis per indv", y = "mean depth per indv") +
theme_standard
# plot distribution missing data per locus ----
p7 <- ggplot(loc_stats_F14, aes(x = MISS_SFL.F14)) +
geom_histogram(binwidth = 0.01, color = "black", fill = "darkorange") +
geom_vline(aes(xintercept = mean(MISS_SFL.F14, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = 0.1),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "% missing data per locus") +
theme_standard
# plot distribution mean read depth ----
p8 <- ggplot(loc_stats_F14, aes(x = MEAN_DEPTH_SFL.F14)) +
geom_histogram(binwidth = 5, color = "black", fill = "darkorange") +
geom_vline(aes(xintercept = mean(MEAN_DEPTH_SFL.F14, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = 20),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "mean read depth per locus") +
theme_standard
# plot read depth vs missing data ----
p9 <- ggplot(loc_stats_F14, aes(x = MEAN_DEPTH_SFL.F14, y = MISS_SFL.F14)) +
geom_point() +
geom_vline(aes(xintercept = mean(MEAN_DEPTH_SFL.F14, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = 20),
color = "darkblue", linetype = "dashed", size = 1) +
geom_hline(aes(yintercept = mean(MISS_SFL.F14, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_hline(aes(yintercept = 0.1),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "mean depth per locus", y = "% missing data") +
theme_standard
# plot depth vs SNP quality ----
# QUAL is Phred-scaled: PROB = 10^(-QUAL/10) is the probability that the
# site call is wrong.
site_qual <- read.table("data/VCF/SFL.F14.lqual",
header = TRUE, stringsAsFactors = FALSE) %>%
mutate(PROB = 10^(-QUAL/10))
# Pair per-locus mean depth with site quality.
# NOTE(review): rows are matched positionally — this relies on identical
# locus ordering in the .lqual and loc_stats files; confirm.
temp <- data.frame(loc_stats_F14$MEAN_DEPTH_SFL.F14, site_qual$QUAL) %>%
rename(depth = loc_stats_F14.MEAN_DEPTH_SFL.F14, qual = site_qual.QUAL)
p10 <- ggplot(temp, aes(x = depth, y = qual)) +
geom_point(size = 1) +
geom_vline(aes(xintercept = mean(depth, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = 20),
color = "darkblue", linetype = "dashed", size = 1) +
geom_hline(aes(yintercept = mean(qual, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_hline(aes(yintercept = 20),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "mean depth per locus", y = "SNP quality") +
theme_standard
# plot number of SNPs per contig vs. mean depth ----
temp <- loc_stats_F14 %>%
count(CHR)
p11 <- left_join(temp, loc_stats_F14) %>%
ggplot() +
geom_point(aes(x = n, y = MEAN_DEPTH_SFL.F14)) +
labs(x = "number of SNPs per contig", y = "mean depth") +
theme_standard
# plot no of SNPs per locus ----
p12 <- loc_stats_F14 %>%
count(CHR) %>%
ggplot(aes(x = n)) +
geom_histogram(binwidth = 1, color = "black", fill = "darkorange") +
labs(x = "number of SNPs per locus") +
theme_standard
m15 <- multiplot(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, cols=2)Compare number of contigs vs number of SNPs.
# number of SNPs
nrow(loc_stats_F14)[1] 52973
# number of contigs in data set
contigs <- loc_stats_F14 %>%
distinct(CHR)
nrow(contigs)[1] 6174
Mean number of SNPs per contig is 8.58 (52973 SNPs / 6174 contigs).
istats <- left_join(ind_stats_raw, ind_stats_F14)Data set contains 52973 SNP sites on 155 and 353 individuals.
Create list of individuals retained in final vcf file:
Change so that new file is printed into Haplotyping/temp folder
vcftools --vcf data/VCF/temp/SFL.F14.recode.vcf --out data/Haplotyping/temp/SFL --recode --recode-INFO-all
vcfsamplenames data/Haplotyping/temp/SFL.recode.vcf > data/Haplotyping/temp/SFL.individualsUse SFL-SFL.individuals to create popmap as a tab-separated file of individuals and their population designation, with one individual per line (make sure UNIX format). This file is needed to write the genepop file, if not provided the script will run through the process but not write a genepop file and place into same folder rad_haplotyper.pl will be run from.
# Build the popmap (individual ID, population) from the list of individuals
# retained in the final VCF; POP is the first "_" field of INDV.
popmap <- read.table("data/Haplotyping/temp/SFL.individuals",
col.names = "INDV", stringsAsFactors = FALSE) %>%
separate(INDV, into = c("POP", "LIB", "ID"), sep = "_", remove = FALSE) %>%
select(INDV, POP)
# FIX: rad_haplotyper expects a TAB-separated popmap (as stated above), but
# write.table's default separator is a space — set sep = "\t" explicitly.
write.table(popmap, "data/Haplotyping/temp/popmap",
col.names = FALSE, row.names = FALSE, quote = FALSE, sep = "\t")Place all necessary files in data/Haplotyping/temp directory (bam, fastq, reference.fasta, vcf=file)
Copy files for analysis into data/Haplotyping/-folder
cd /home/soleary/FLOUNDER/SFL_PopGen/data/Haplotyping/temp
rm *.fq.gz
rm *.bam*
cp /home/soleary/FLOUNDER/SFL_PopGen/data/Haplotyping/temp/ind_stats.out /home/soleary/FLOUNDER/SFL_PopGen/data/Haplotyping/
cp /home/soleary/FLOUNDER/SFL_PopGen/data/Haplotyping/temp/stats.out /home/soleary/FLOUNDER/SFL_PopGen/data/Haplotyping/
cp /home/soleary/FLOUNDER/SFL_PopGen/data/Haplotyping/temp/SFL.gen /home/soleary/FLOUNDER/SFL_PopGen/data/Haplotyping/Comparison of the number of loci that were filtered due to excess number of missing data or suspected paralogs during the haplotyping process to those that passed. Haplotyer was run without any threshold values (other than default values). The genepop file only contains loci that passed haplotyping stage.
# import stats files generated by haplotyper
hap_stats <- read.hap.stats("data/Haplotyping/stats.out")
hap_ind_stats <- read.delim("data/Haplotyping/ind_stats.out",
stringsAsFactors = FALSE)
# view comparison of filtered vs. passed loci
plot.filter.status(hap_stats)Remove filtered loci from data set.
count(hap_stats, Status == "FILTERED")hap_stats <- remove.filtered.haps(hap_stats)6174 loci passed haplotyping process.
plot.hap.stats(hap_stats)Data set contains 353 individuals:
plot.ind.hap.stats(hap_ind_stats)hap_stats <- read.hap.stats("data/Haplotyping/stats.out")
p1 <- ggplot(hap_stats, aes(x = Prop_Haplotyped)) +
geom_histogram(binwidth = 0.025, color = "black", fill = "darkorange") +
geom_vline(aes(xintercept = mean(Prop_Haplotyped, na.rm = TRUE)),
color = "darkred", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = 0.9),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "proportion individuals haplotyped") +
theme_standard
p2 <- ggplot(hap_stats, aes(x = Prop_Haplotyped, y = Poss_Paralog)) +
geom_point(shape = 21, color = "black", fill = "darkorange") +
geom_vline(aes(xintercept = mean(Prop_Haplotyped, na.rm = TRUE)),
color = "darkred", linetype = "dashed", size = 1) +
labs(x = "proportion individuals haplotyped", y = "possible paralogs") +
theme_standard
p3 <- ggplot(hap_stats, aes(x = Prop_Haplotyped, y = `Low_Cov.Geno_Err`)) +
geom_point(shape = 21, color = "black", fill = "darkorange") +
geom_vline(aes(xintercept = mean(Prop_Haplotyped, na.rm = TRUE)),
color = "darkred", linetype = "dashed", size = 1) +
labs(x = "proportion individuals haplotyped", y = "low coverage error") +
theme_standard
multiplot(p1, p2, p3, cols = 3)Flag loci successfully haplotyped in < 90% of individuals.
# number of loci called in >95% individuals
count(hap_stats, Prop_Haplotyped < 0.9)# create vector of loci to remove (choose cut-off)
Prop_Haplotyped <- filter.loci.prop_haplotyped(hap_stats, 0.9)1276 loci were flagged as genotype call rate < 0.9.
Loci are flagged as possible paralogs for an individual when more than the expected number of haplotypes, based on the SNP genotype call (homozygote, heterozygote), is detected.
ggplot(hap_stats, aes(x = Poss_Paralog)) +
geom_histogram(binwidth = 5, color = "black", fill = "darkorange") +
geom_vline(aes(xintercept = mean(Poss_Paralog, na.rm = TRUE)),
color = "darkred", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = (nrow(hap_ind_stats)*0.01)),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "possible paralogs per locus") +
theme_standard1 % of individuals: 3.53 5 % of individuals: 17.65
Flag loci that are flagged as potential paralogs in 3 or more individuals.
# number of loci with Poss_Paralogs
count(hap_stats, Poss_Paralog > 3)# create vector of loci to remove (choose cut-off)
Poss_Paralogs <- filter.loci.paralogs(hap_stats, 3)1558 loci were flagged as possible paralogs.
Each locus varies in the number of SNPs detected which determines the number of haplotypes expected in that population.
ggplot(hap_stats, aes(x = Sites)) +
geom_histogram(binwidth = 1, color = "black", fill = "darkorange") +
geom_vline(aes(xintercept = mean(Sites, na.rm = TRUE)),
color = "darkred", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = 25),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "number of SNPs per contig") +
theme_standardFiltering loci based on number of SNPs contained on that locus could bias the data set as loci with high recombination may be removed. On the other hand, assuming an approximate length of 250 bp loci with more than 25 SNPs would mean that 10% of bases are a polymorphisms.
p1 <- ggplot(hap_stats, aes(x = Haplotypes, y = Poss_Paralog)) +
geom_point(shape = 21, color = "black", fill = "darkorange") +
geom_smooth(method = "lm") +
labs(x = "no. haplotypes", y = "possible paralogs") +
theme_standard
p2 <- ggplot(hap_stats, aes(x = Sites, y = Poss_Paralog)) +
geom_point(shape = 21, color = "black", fill = "darkorange") +
geom_smooth(method = "lm") +
labs(x = "no. of sites", y = " ") +
theme_standard
multiplot(p1, p2, cols = 2)Identify number of haplotypes loci with > 35 SNP sites.
count(hap_stats, Sites > 35)NoSNps <- filter.loci.noSNPs(hap_stats, 35)1 loci were flagged.
Assuming that mutation is the only mechanism resulting in new haplotypes, the maximum expected number of haplotypes per locus is number of SNPs N + 1.
p1 <- ggplot(hap_stats, aes(x = Haplotypes)) +
geom_histogram(binwidth = 1, color = "black", fill = "darkorange") +
geom_vline(aes(xintercept = mean(Haplotypes, na.rm = TRUE)),
color = "darkred", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = quantile(Haplotypes, .99, na.rm = TRUE)),
color = "darkred", linetype = "dashed", size = 1) +
labs(x = "# haplotypes per locus", y = "# loci") +
theme_standard
temp <- hap_stats %>%
select(Locus, Sites, Haplotypes, Prop_Haplotyped, Low_Cov.Geno_Err, Poss_Paralog) %>%
mutate(exp_sites = Sites + 1) %>%
mutate(xtra = Haplotypes - Sites)
p2 <- ggplot(temp, aes(x = Sites, y = Poss_Paralog)) +
geom_point() +
geom_hline(yintercept = 5, color = "darkblue", linetype = "dashed", size = 1) +
geom_vline(xintercept = 25, color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "# SNP sites per locus", y = "possible paralogs") +
theme_standard
p3 <- ggplot(temp, aes(x = Sites, y = xtra)) +
geom_point() +
geom_hline(yintercept = 10, color = "darkblue", linetype = "dashed", size = 1) +
geom_vline(xintercept = 25, color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "# SNP sites per locus", y = "# extra haplotypes") +
theme_standard
p4 <- ggplot(temp, aes(x = xtra, y = Prop_Haplotyped)) +
geom_point(size = 1) +
geom_hline(yintercept = 0.9, color = "darkblue", linetype = "dashed", size = 1) +
geom_vline(xintercept = 25, color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "# extra haplotypes", y = "proportion of indv haplotyped") +
theme_standard
p5 <- ggplot(temp, aes(x = xtra, y = Low_Cov.Geno_Err)) +
geom_point(size = 1) +
geom_hline(yintercept = 5, color = "darkblue", linetype = "dashed", size = 1) +
geom_vline(xintercept = 25, color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "# extra haplotypes", y = "potential genotyping error") +
theme_standard
p6 <- ggplot(temp, aes(x = xtra, y = Poss_Paralog)) +
geom_point(size = 1) +
geom_hline(yintercept = 5, color = "darkblue", linetype = "dashed", size = 1) +
geom_vline(xintercept = 25, color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "# extra haplotypes", y = "possible paralogs") +
theme_standard
m21 <- multiplot(p1, p2, p3, p4, p5, p6, cols = 2)Again, removing loci with unexpectedly high numbers of haplotypes may bias the data set, as loci with e.g. high recombination may be removed from the data set.
count(temp, xtra >= 50)# create vector of loci to remove
NoHaps <- filter(temp, xtra >= 50)
NoHaps <- NoHaps$Locus26 loci were flagged for excessive number of haplotypes compared to the number of SNPs at that locus.
After combining SNPs on the same contig during the haplotyping process, it is possible to observe fewer than the expected number of haplotypes based on the genotype call (heterozygote/homozygote). When this occurs, that genotype is coded as missing. For each locus, the number of individuals for which it is flagged as a potential genotyping error, or as suffering from null alleles due to low coverage, is recorded.
ggplot(hap_stats, aes(x = Low_Cov.Geno_Err)) +
geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
geom_vline(aes(xintercept = mean(Low_Cov.Geno_Err, na.rm = TRUE)),
color = "darkred", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = (nrow(hap_ind_stats)*0.05)),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "potential genotyping error") +
theme_standardLoci with pronounced patterns of genotyping error likely due to low coverage will have low success rate for genotyping, i.e. they will be caught in missing data filters. Coverage issues are likely genotype specfic; previous filters have targeted loci and individuals with high variance in coverage and suspect genotpes have been coded as missing, i.e. this filter need not be very conservative, regardless, loci that are repeatedly being flagged as problematic should be removed.
# summary of count of possible genotyping errors
count(hap_stats, Low_Cov.Geno_Err > 32)# create vector of loci to remove
Geno_Err <- filter.loci.geno_err(hap_stats, 32)320 loci were flagged as potentially affected by genotyping error.
Loci that are not successfully haplotyped in an individual — due to missing data, a complex locus, or a haplotype count higher/lower than expected from the SNP genotypes — are coded as missing for that individual. Problematic individuals can be identified by their high proportion of missing data.
Individuals with low proportion of successfully haplotyped loci
ggplot(hap_ind_stats, aes(x = Prop_Success)) +
geom_histogram(binwidth = 0.01, color = "black", fill = "darkorange") +
geom_vline(aes(xintercept = mean(Prop_Success, na.rm = TRUE)),
color = "darkred", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = 0.75),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "proportion loci successfully haplotyped") +
theme_standardProblem individuals can be identified by choosing a cut-off value for the minium proportion of sucessfully haplotyped loci and excluding individuals below that threshold value. Removing loci with low haplotyping success with decrease the amount of missing data. Therefore, final missing data cut-offs should be applied after removing those loci from the data set, though it is important to flag potentially problematic loci based on their haplotyping stats at this point.
# count individuals above set threshold
count(hap_ind_stats, Prop_Success < .75)# create vector of indv to remove (choose cut-off)
PropSuccessInd <- filter.ind.prop_success(hap_ind_stats, .75)
write.table(PropSuccessInd, "data/POPGEN/propSucc.indv",
col.names = TRUE, row.names = FALSE, quote = FALSE, sep = "\t")
View(as.data.frame(PropSuccessInd))Possible paralogs per individual
Individuals with high proportion of loci flagged as possible paralogs should be flagged as potential problem individuals.
ggplot(hap_ind_stats, aes(x = Poss_Paralogs)) +
geom_histogram(binwidth = 25, color = "black", fill = "darkorange") +
geom_vline(aes(xintercept = mean(Poss_Paralogs, na.rm = TRUE)),
color = "darkred", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = (nrow(hap_stats)*0.10)),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "no of loci potential paralogs") +
theme_standardCut-off for individuals with loci flagged paralogs in > 5% of loci is 308.7.
# count individuals above set threshold
count(hap_ind_stats, Poss_Paralogs > 300)# create vector of indv to remove (choose cut-off)
Poss_ParalogInd <- filter.ind.paralogs(hap_ind_stats, 300)
write.table(Poss_ParalogInd, "data/POPGEN/possParal.indv",
col.names = TRUE, row.names = FALSE, quote = FALSE, sep = "\t")
View(as.data.frame(Poss_ParalogInd))The highest number of flagged loci in an individuals is 378, which is equivalent to 6.12% of loci.
Individuals with high proportion of potential allele dropout/genotyping error
ggplot(hap_ind_stats, aes(x = Low_Coverage.Errors)) +
geom_histogram(binwidth = 5, color = "black", fill = "darkorange") +
geom_vline(aes(xintercept = mean(Low_Coverage.Errors, na.rm = TRUE)),
color = "darkred", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = (nrow(hap_stats)*0.05)),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "# loci potential genotyping error", y = "# indv") +
theme_standardRemove individuals with high proportion of loci that have been flagged as potential allele dropouts/genotyping error.
# count individuals above set threshold
count(hap_ind_stats, Low_Coverage.Errors > 300)# create vector of indv to remove (choose cut-off)
GenoErrIndv <- filter.ind.geno_err(hap_ind_stats, 300)
write.table(GenoErrIndv, "data/POPGEN/GenoErr.indv",
col.names = TRUE, row.names = FALSE, quote = FALSE, sep = "\t")
View(GenoErrIndv)The highest number of flagged loci in an individuals is 246, which is equivalent to 3.98% of loci.
Load genepop file with haplotyped loci.
# import genepop file
gen <- read.genepop(file = "data/Haplotyping/SFL.gen",
ncode = 3L, quiet = FALSE)
Converting data from a Genepop .gen file to a genind object...
File description: SFL.gen
...done.
gen/// GENIND OBJECT /////////
// 353 individuals; 5,988 loci; 79,487 alleles; size: 119.4 Mb
// Basic content
@tab: 353 x 79487 matrix of allele counts
@loc.n.all: number of alleles per locus (range: 1-116)
@loc.fac: locus factor for the 79487 columns of @tab
@all.names: list of allele names for each locus
@ploidy: ploidy of each individual (range: 2-2)
@type: codom
@call: read.genepop(file = "data/Haplotyping/SFL.gen", ncode = 3L, quiet = FALSE)
// Optional content
@pop: population of each individual (group size range: 4-71)
Unfiltered haplotyped data set contains 353 individuals and 5,988 loci.
# remove flagged loci
# Concatenate the locus blacklists built earlier in the script (possible
# paralogs, no SNPs, no haplotypes, genotyping error, low haplotyping
# success) and drop them from the genind object.
removeloc <- c(Poss_Paralogs, NoSNps, NoHaps, Geno_Err, Prop_Haplotyped)
gen <- genind.rem.loci(gen, removeloc)
# remove flagged individuals
# "SC_4AE05_Ple-01197" is dropped by hand -- presumably flagged during
# earlier QC; reason not stated here -- TODO confirm
removeInd <- c(Poss_ParalogInd, PropSuccessInd, GenoErrIndv, "SC_4AE05_Ple-01197")
gen <- gen.ind.rem.Ind(gen, removeInd)
gen/// GENIND OBJECT /////////
// 342 individuals; 3,676 loci; 35,586 alleles; size: 52.3 Mb
// Basic content
@tab: 342 x 35586 matrix of allele counts
@loc.n.all: number of alleles per locus (range: 1-66)
@loc.fac: locus factor for the 35586 columns of @tab
@all.names: list of allele names for each locus
@ploidy: ploidy of each individual (range: 2-2)
@type: codom
@call: .local(x = x, i = i, j = j, drop = drop)
// Optional content
@pop: population of each individual (group size range: 4-71)
After filtering data set contains 342 individuals and 3,676 loci.
# extract genotype matrix ----
# Flatten the genind object into one row per individual and one column per
# locus, the two alleles of a genotype joined by ":".  Row names (library
# IDs) are promoted to a "rowname" column before the pop column is dropped.
df <- genind2df(gen, usepop = TRUE, sep = ":", oneColPerAll = FALSE) %>%
  rownames_to_column() %>%
  select(-pop)
# create list of duplicates ----
# Technical duplicate pairs (the same biological sample run twice) used
# below to estimate genotyping error.  Each element holds the two library
# IDs of one pair; the first ID of each pair is collected into `dup` in the
# comparison loop and removed from the data set afterwards.
pairs <- list(
  c("TX_4BD06_SL-SF-2016-08", "TX_4BD07_SL-SF-2016-09"),
  c("TX_4BD02_SL-SF-2016-04", "TX_4BD04_SL-SF-2016-06"),
  c("TX_6AG10_SL-SF-2016-42", "TX_6AG11_SL-SF-2016-43"),
  c("FLA_4AF12_FL-SF-2016-103", "FLA_5AA07_FL-SF-2016-103"),
  c("TX_5BF02_SL-SF-2016-39", "TX_6AG12_SL-SF-2016-44"),
  c("FL_4AC03_FL-SF-2016-63", "FL_4BA09_FL-SF-2016-64"),
  c("SL_5AH04_SL-SF-2016-52", "TX_6AH08_SL-SF-2016-58"),
  c("FLA_4AG01_FL-SF-2016-104", "FLA_5AA08_FL-SF-2016-104"),
  c("TX_5BE06_SL-SF-2016-31", "TX_6AH04_SL-SF-2016-48"),
  c("TX_5BE12_SL-SF-2016-37", "TX_5BE11_SL-SF-2016-36"),
  c("TX_6AH07_SL-SF-2016-53", "TX_4AB10_SL-SF-2016-56"),
  c("TX_4BD05_SL-SF-2016-07", "TX_4BD03_SL-SF-2016-05"),
  c("TX_6AH03_SL-SF-2016-47", "TX_5BE08_SL-SF-2016-33"),
  c("TX_5BE10_SL-SF-2016-35", "TX_6AG09_SL-SF-2016-41"),
  c("TX_6AH05_SL-SF-2016-49", "TX_5BE05_SL-SF-2016-30"),
  c("TX_6AH01_SL-SF-2016-45", "TX_5BF01_SL-SF-2016-38"),
  c("SL_5AG09_SL-SF-2016-51", "TX_6AH06_SL-SF-2016-50"),
  c("FL_4AC06_FL-SF-2016-45", "FL_5BF07_FL-SF-2016-45"),
  c("FL_6AB02_FL-SF-2016-14", "FL_4AG03_FL-SF-2016-14"),
  c("SL_5AH06_SL-SF-2016-54", "TX_4AB10_SL-SF-2016-56"),
  c("SL_5AH06_SL-SF-2016-54", "TX_6AH07_SL-SF-2016-53"),
  c("FL_6AB01_FL-SF-2016-08", "FL_4AC04_FL-SF-2016-08"),
  c("MS_4AA06_MS-SF-2015-58", "MS_5BH06_MS-SF-2015-44"),
  c("TX_6AH02_SL-SF-2016-46", "TX_5BE07_SL-SF-2016-32"),
  c("SL_5AH07_SL-SF-2016-55", "TX_4AB11_SL-SF-2016-57"),
  c("TX_4AC01_SA-SF-2016-13", "TX_5BB09_SA-SF-2016-13")
)
# create empty list for genotype error (discordant loci)
genoerror <- list()
# create empty list for duplicates (retains first indv in pair)
dup <- character()
# identify discordant genotypes for each set of duplicates
# For each duplicate pair: pull the two genotype rows, transpose so loci
# become rows, and keep loci where the two calls differ.
# NOTE(review): filter(V1 != V2) drops comparisons involving NA genotypes,
# so missing data is not counted as discordance -- confirm this is intended.
for (p in pairs){
# select duplicates from genotype matrix
geno <- df %>%
filter(rowname %in% p) %>%
select(-rowname)
# compare genotypes
# after transposing, the two individuals become columns V1/V2; their order
# follows row order in `df`, not the order within `p`
comp <- (t(geno))
contigs <- as.data.frame(comp) %>%
rownames_to_column(var = "LOCUS") %>%
mutate(V1 = as.character(V1),
V2 = as.character(V2)) %>%
filter(V1 != V2)
# write vector with first individual in pair (removed later as duplicate)
ind <- p[1]
dup <- c(dup, ind)
genoerror[[ind]] <- contigs
}
# if it throws error object V1 or V2 not found it means one or more of the samples names are not correct
# combine into one dataframe
# plyr::ldply stacks the per-pair data frames; `.id` carries the first
# individual of each pair and becomes the DUP column
genoerror <- ldply(genoerror, data.frame) %>%
rename(DUP = `.id`,
INDV1 = V1,
INDV2 = V2)Compare number of discordant genotype calls per pair
# total number of loci retained, used to express error counts as frequencies
total <- as.numeric(length(locNames(gen)))
# compare number of loci by pair
per_ind <- count(genoerror, DUP) %>%
mutate(freq = n/total)
# histogram of per-pair genotyping error frequency
ggplot(per_ind, aes(freq)) +
geom_histogram(binwidth = 0.005, fill = "darkorange", color = "black") +
labs(x = "genotyping error frequency per indv") +
theme_standardIdentify loci consistently affected by genotyping error
# `total` is now the number of duplicate pairs, so `freq` below is the
# fraction of pairs in which a locus was discordant
total <- as.numeric(length(pairs))
# compare loci affected by genotyping error
per_loc <- count(genoerror, LOCUS) %>%
mutate(freq = n/total) %>%
rename(Locus = LOCUS)
# attach per-locus haplotyping stats (join key: Locus)
per_loc <- left_join(per_loc, hap_stats)Joining, by = "Locus"
# distribution of per-locus genotyping error frequency
ggplot(per_loc, aes(freq)) +
geom_histogram(binwidth = 0.05, fill = "darkorange", color = "black") +
labs(x = "freq genotyping error per locus") +
theme_standardRemove flagged loci and one individual per pair.
# remove duplicates
# Drop one individual per duplicate pair (the first of each pair, collected
# in `dup` during the comparison loop above).
removeInd <- dup
gen <- gen.ind.rem.Ind(gen, removeInd)
# remove flagged loci
# loci discordant in more than 3 of the 26 duplicate pairs
removeloc <- filter(per_loc, n > 3)
# BUG FIX: genind.rem.loci() is called with a vector of locus names at every
# other call site in this script; here the whole data frame was passed and
# the locus count was unchanged afterwards (3,676 before and after),
# i.e. the removal silently did nothing.  Pass the name column instead
# (column is `Locus` after the rename when building per_loc).
gen <- genind.rem.loci(gen, removeloc$Locus)
gen/// GENIND OBJECT /////////
// 317 individuals; 3,676 loci; 35,586 alleles; size: 48.9 Mb
// Basic content
@tab: 317 x 35586 matrix of allele counts
@loc.n.all: number of alleles per locus (range: 1-66)
@loc.fac: locus factor for the 35586 columns of @tab
@all.names: list of allele names for each locus
@ploidy: ploidy of each individual (range: 2-2)
@type: codom
@call: .local(x = x, i = i, j = j, loc = ..1, drop = drop)
// Optional content
@pop: population of each individual (group size range: 11-67)
26551 loci were removed as potentially affected by genotyping error.
Generate summary statistics
# Attach sample metadata as strata and compute genetic diversity summaries.
SampleInfo <- read_delim("data/POPGEN/SampleInfo.txt", delim = "\t") %>%
distinct(SAMPLE_ID, .keep_all = TRUE)
# recover WELL/SAMPLE_ID from the underscore-separated library IDs; the
# leading POP token is dropped (re-joined from SampleInfo instead)
Inds <- as.data.frame(indNames(gen)) %>%
rename(LIB_ID = `indNames(gen)`) %>%
separate(LIB_ID, into = c("POP", "WELL", "SAMPLE_ID"), sep = "_", remove = FALSE) %>%
select(-POP)
# one metadata row per retained individual, in indNames(gen) order
strata <- left_join(Inds, SampleInfo) %>%
distinct()
strata(gen) <- strata
# define groups using strata information
setPop(gen) <- ~POP
popNames(gen)[1] "AL" "FL" "LA" "FLA" "TX" "MS" "SC"
# generate genetic diversity stats
GenDiv <- adegenet::summary(gen)
# hierfstat's internal converter (triple colon: not an exported function)
dat <- hierfstat:::.genind2hierfstat(gen)
GenDiv2 <- basic.stats(dat)Compare observed (Ho) and expected (He) heterozygosity for all individuals across all populations.
# observed heterozygosity per locus
Ho <- as.data.frame(GenDiv$Hobs) %>%
rownames_to_column("LOCUS") %>%
rename(Ho = `GenDiv$Hobs`)
# expected heterozygosity per locus
Hs <- as.data.frame(GenDiv$Hexp) %>%
rownames_to_column("LOCUS") %>%
rename(Hexp = `GenDiv$Hexp`)
# expected and observed heterozygosity per locus, joined by locus name
Het <- left_join(Ho, Hs, by = "LOCUS")Identify loci that are now monomorphic and remove from data set:
# NOTE(review): "monomorphic" is identified here as loci with observed
# heterozygosity Ho == 0.  Strictly, a locus with no heterozygotes can still
# carry more than one allele; Hexp == 0 (or loc.n.all == 1) would be the
# stricter test -- confirm this matches the intent.
monomorphic <- filter(Ho, Ho == 0)
View(monomorphic)
# keep only the locus names
monomorphic <- monomorphic$LOCUS
# remove flagged loci
removeloc <- c(monomorphic)
gen <- genind.rem.loci(gen, removeloc)
gen/// GENIND OBJECT /////////
// 317 individuals; 3,615 loci; 35,459 alleles; size: 48.9 Mb
// Basic content
@tab: 317 x 35459 matrix of allele counts
@loc.n.all: number of alleles per locus (range: 2-66)
@loc.fac: locus factor for the 35459 columns of @tab
@all.names: list of allele names for each locus
@ploidy: ploidy of each individual (range: 2-2)
@type: codom
@call: .local(x = x, i = i, j = j, loc = ..1, drop = drop)
// Optional content
@pop: population of each individual (group size range: 20-67)
@strata: a data frame with 29 columns ( LIB_ID, WELL, SAMPLE_ID, POP, REGION, OCEAN, ... )
Expected vs observed heterozygosity across all populations.
# drop the loci with Ho == 0 from the plot (they were removed from gen above)
temp <- Het %>%
filter(Ho != 0)
# plot Ho vs Hs across all individuals
ggplot(temp, aes(x = Ho, y = Hexp)) +
geom_point() +
# 1:1 line: points above it indicate a heterozygote deficit relative to Hexp
geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed", size = 1) +
xlim(0, 1) +
ylim(0, 1) +
labs(x = "observed heterozygosity Ho", y = "within cluster diversity Hs (Hexp)") +
theme_standardCompare distribution of observed heterozygosity (Ho) in each putative population:
# Per-population observed heterozygosity from hierfstat::basic.stats();
# columns come out named "1".."7".
# NOTE(review): the rename assumes the numeric column order corresponds to
# AL, FL, LA, FLA, TX, MS, SC (the popNames order printed above) -- confirm.
Ho_per <- as.data.frame(GenDiv2$Ho) %>%
rownames_to_column("LOCUS") %>%
rename(AL = `1`, FL = `2`, LA = `3`, FLA = `4`, TX = `5`, MS = `6`, SC = `7`) %>%
select(LOCUS, AL, FL, FLA, LA, MS, SC, TX) %>%
gather(Group, Ho, 2:8)
# compare observed heterozygosity per locus and cluster
ggplot(Ho_per, aes(x = Group, y = Ho)) +
geom_boxplot() +
labs(x = "Group", y = "observed heterozygosity Ho") +
theme_standardCompare distribution of expected heterozygosity He, (within cluster diversity) in each putative population:
# within cluster diversity/exp diversity per locus (rows) and cluster (columns)
# NOTE(review): the rename assumes basic.stats() column order "1".."7"
# matches the popNames order (AL, FL, LA, FLA, TX, MS, SC) -- confirm.
Hs_per <- as.data.frame(GenDiv2$Hs) %>%
rownames_to_column("LOCUS") %>%
rename(AL = `1`, FL = `2`, LA = `3`, FLA = `4`, TX = `5`, MS = `6`, SC = `7`) %>%
select(LOCUS, AL, FL, FLA, LA, MS, SC, TX) %>%
gather(Group, Hs, 2:8)
# compare expected heterozygosity per locus and cluster
ggplot(Hs_per, aes(x = Group, y = Hs)) +
geom_boxplot() +
# FIX: the y-axis label previously read "Ho" although the plotted value is Hs
labs(x = "Group", y = "expected heterozygosity Hs") +
theme_standardComparison of observed vs. expected heterozygosity within each population:
# join data frames
# per-locus, per-group Ho and Hs in long format (join keys: LOCUS, Group)
Het_per <- left_join(Hs_per, Ho_per)
# plot per cluster
ggplot(Het_per, aes(x = Ho, y = Hs)) +
geom_point(shape = 19, size = 1) +
# 1:1 line for reference in each panel
geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed", size = 1) +
xlim(0, 1) +
ylim(0, 1) +
facet_grid(. ~ Group) +
labs(x = "observed heterozygosity Ho", y = "within cluster diversity Hs (Hexp)") +
theme_standardTest for deviations from Hardy-Weinberg equilibrium within each group (POP level):
setPop(gen) <- ~POP
popNames(gen)
# HWE testing (hw.test with B = 1000 permutations) is slow, so the block
# below was run once and its results cached to results/HWEbypop; the cached
# table is re-loaded further down.
# # should be able to use lapply
# HWE <- seppop(gen) %>%
# lapply(hw.test, B=1000)
#
# HWE[["ALL"]] <- hw.test(gen, B = 1000)
#
# # convert to data.frames
# hwe <- list()
#
# for (m in names(HWE)) {
# hwe[[m]] <- as.data.frame(HWE[[m]]) %>%
# rename(pval = Pr.exact) %>%
# rownames_to_column("LOCUS") %>%
# select(LOCUS, pval)
# }
#
# hwe <- ldply(hwe, data.frame) %>%
# rename(POP = `.id`) %>%
# mutate(HWE = ifelse(pval <= 0.05, "OUT", "IN"))
# write_delim(hwe, "results/HWEbypop", delim = "\t")
# load cached per-population HWE results (POP, LOCUS, pval, IN/OUT flag)
hwe <- read_delim("results/HWEbypop", delim = "\t")
# number of loci in vs out of HWE per population
hwe_perpop <- hwe %>%
group_by(POP) %>%
count(HWE)
# per locus: in how many populations it deviates from HWE ("ALL" excluded)
hwe_perloc <- hwe %>%
filter(POP != "ALL") %>%
group_by(LOCUS) %>%
count(HWE) %>%
filter(HWE == "OUT")Format output and view results:
# plot data
# bar chart of loci in vs out of HWE for each population
ggplot(hwe_perpop, aes(x = HWE, y = n)) +
geom_bar(stat = "identity", color = "black", fill = "darkorange") +
# FIX: loci are flagged OUT when pval <= 0.05 (see the hwe construction
# above), so the significance direction in the axis label was inverted
labs(x = "significant deviation from HWE (p < 0.05)", y = "number of loci") +
facet_grid( ~ POP) +
theme_standardDistribution of number of times loci are out of HWE
# histogram: in how many of the 7 populations each locus deviates from HWE
ggplot(hwe_perloc, aes(x = n)) +
geom_histogram(binwidth = 1, color = "black", fill = "darkorange") +
labs(x = "no populations out of HWE", y = "no. loci") +
theme_standardRemove loci out of HWE in every population:
# remove flagged loci ----
# Drop loci that deviate from HWE in all 7 populations (n >= 7).
removeloc <- filter(hwe_perloc, n >= 7)
gen <- genind.rem.loci(gen, removeloc$LOCUS)
gen/// GENIND OBJECT /////////
// 317 individuals; 3,558 loci; 34,527 alleles; size: 47.6 Mb
// Basic content
@tab: 317 x 34527 matrix of allele counts
@loc.n.all: number of alleles per locus (range: 2-66)
@loc.fac: locus factor for the 34527 columns of @tab
@all.names: list of allele names for each locus
@ploidy: ploidy of each individual (range: 2-2)
@type: codom
@call: .local(x = x, i = i, j = j, loc = ..1, drop = drop)
// Optional content
@pop: population of each individual (group size range: 20-67)
@strata: a data frame with 29 columns ( LIB_ID, WELL, SAMPLE_ID, POP, REGION, OCEAN, ... )
Write file with filtered data set:
gen/// GENIND OBJECT /////////
// 317 individuals; 3,558 loci; 32,723 alleles; size: 45.1 Mb
// Basic content
@tab: 317 x 32723 matrix of allele counts
@loc.n.all: number of alleles per locus (range: 2-66)
@loc.fac: locus factor for the 32723 columns of @tab
@all.names: list of allele names for each locus
@ploidy: ploidy of each individual (range: 2-2)
@type: codom
@call: radiator::write_genind(data = input)
// Optional content
@pop: population of each individual (group size range: 20-67)
@strata: a data frame with 2 columns ( INDIVIDUALS, POP_ID )
Write files with individuals and Contigs still contained in data set and use to filter vcf file.
# loci retained in the final genind object
keeploci <- as.data.frame(locNames(gen)) %>%
rename(LOCUS = `locNames(gen)`)
# translate retained contig names back to CHR/POS entries of the VCF
keeploci <- filter(loc_stats_F14, CHR %in% keeploci$LOCUS) %>%
select(CHR, POS)
write.table(keeploci, "data/VCF/filtered.loci",
col.names = FALSE, row.names = FALSE, quote = FALSE)
# individuals retained; library IDs are split by character position
# (sep = 3 strips the population prefix, sep = 6 strips the well code).
# NOTE(review): this assumes fixed-width ID components, but some pop codes
# are 3 characters (e.g. "FLA") and others 2 -- confirm the split widths
# produce valid SAMPLE_IDs for every individual.
keepind <- as.data.frame(indNames(gen)) %>%
rename(LIB_ID = `indNames(gen)`) %>%
separate(LIB_ID, into = c("temp", "ID"), sep = 3, remove = TRUE) %>%
select(-temp) %>%
separate(ID, into = c("temp", "SAMPLEI_ID"), sep = 6, remove = TRUE) %>%
select(-temp)
keepind <-filter(ind_stats_F14, SAMPLE_ID %in% keepind$SAMPLEI_ID) %>%
select(INDV)
write.table(keepind, "data/VCF/filtered.ind",
col.names = FALSE, row.names = FALSE, quote = FALSE)Write vcf file and 012 format:
# Subset the VCF to the filtered loci and individuals, then recompute
# per-individual and per-site statistics on the filtered data set.
vcftools --vcf data/VCF/temp/SFL.F14.recode.vcf --out data/VCF/temp/SFL --positions data/VCF/filtered.loci --keep data/VCF/filtered.ind --recode --recode-INFO-all
# genotype matrix in 012 format for downstream popgen analyses
vcftools --vcf data/VCF/temp/SFL.recode.vcf --out data/POPGEN/SFL --012
# per-individual mean depth
vcftools --vcf data/VCF/temp/SFL.recode.vcf --out data/VCF/SFL.fil --depth
# per-site mean depth
vcftools --vcf data/VCF/temp/SFL.recode.vcf --out data/VCF/SFL.fil --site-mean-depth
# missingness per individual and per site
vcftools --vcf data/VCF/temp/SFL.recode.vcf --out data/VCF/SFL.fil --missing-indv
vcftools --vcf data/VCF/temp/SFL.recode.vcf --out data/VCF/SFL.fil --missing-site
# heterozygosity, HWE, site quality and per-genotype depth reports
vcftools --vcf data/VCF/temp/SFL.recode.vcf --out data/VCF/SFL.fil --het
vcftools --vcf data/VCF/temp/SFL.recode.vcf --out data/VCF/SFL.fil --hardy
vcftools --vcf data/VCF/temp/SFL.recode.vcf --out data/VCF/SFL.fil --site-quality
vcftools --vcf data/VCF/temp/SFL.recode.vcf --out data/VCF/SFL.fil --geno-depthCompare stats for final filtered data set:
# load stats files ----
# per-individual stats (missingness, depth, Fis, ...) for the filtered VCF
ind_stats_fil <- read.ind.stats(dir = "data/VCF", vcf = "SFL.fil") %>%
separate(INDV, into = c("SP", "LIB", "SAMPLE_ID"), sep = "_", remove = FALSE)
# per-site stats for the filtered VCF
loc_stats_fil <- read.loc.stats(dir = "data/VCF/", vcf = "SFL.fil")
View(ind_stats_fil)
View(hap_ind_stats)
# Per-individual diagnostic plots for the filtered data set.  Convention
# throughout: red dashed line = mean of the plotted statistic, blue dashed
# line = the filtering threshold used earlier in the pipeline.
# plot missing data per indv ----
p1 <- ggplot(ind_stats_fil, aes(x = MISS_SFL.fil)) +
geom_histogram(binwidth = .01, color = "black", fill = "darkorange") +
geom_vline(aes(xintercept = mean(MISS_SFL.fil, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = 0.5),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "missing data per indv") +
theme_standard
# plot Fis per indv ----
p2 <- ggplot(ind_stats_fil, aes(x = Fis_SFL.fil)) +
geom_histogram(binwidth = .01, color = "black", fill = "darkorange") +
geom_vline(aes(xintercept = mean(Fis_SFL.fil, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = 0),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "Fis per indv") +
theme_standard
# plot read depth per indv ----
p3 <- ggplot(ind_stats_fil, aes(x = MEAN_DEPTH_SFL.fil)) +
geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
geom_vline(aes(xintercept = mean(MEAN_DEPTH_SFL.fil, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = 20),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "mean read depth per indv") +
theme_standard
# plot depth vs missing ----
p4 <- ggplot(ind_stats_fil, aes(x = MEAN_DEPTH_SFL.fil, y = MISS_SFL.fil)) +
geom_point() +
geom_vline(aes(xintercept = mean(MEAN_DEPTH_SFL.fil, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = 20),
color = "darkblue", linetype = "dashed", size = 1) +
geom_hline(aes(yintercept = mean(MISS_SFL.fil, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_hline(aes(yintercept = 0.5),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "mean depth per indv", y = "% missing data") +
theme_standard
# plot Fis vs missing data per indv ----
p5 <- ggplot(ind_stats_fil, aes(x = Fis_SFL.fil, y = MISS_SFL.fil)) +
geom_point() +
geom_vline(aes(xintercept = mean(Fis_SFL.fil, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = 0),
color = "darkblue", linetype = "dashed", size = 1) +
geom_hline(aes(yintercept = mean(MISS_SFL.fil, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_hline(aes(yintercept = 0.5),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "Fis per indv", y = "% missing data") +
theme_standard
# plot Fis vs mean depth per indv ----
p6 <- ggplot(ind_stats_fil, aes(x = Fis_SFL.fil, y = MEAN_DEPTH_SFL.fil)) +
geom_point() +
geom_vline(aes(xintercept = mean(Fis_SFL.fil, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = 0),
color = "darkblue", linetype = "dashed", size = 1) +
geom_hline(aes(yintercept = mean(MEAN_DEPTH_SFL.fil, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_hline(aes(yintercept = 20),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "Fis per indv", y = "mean depth per indv") +
theme_standard
# plot distribution missing data per locus ----
# FIX: this panel is labelled per-locus (and uses the per-locus 0.1 cut-off)
# but was built from the per-individual stats table, duplicating p1 exactly;
# use the per-locus table as in p8/p9.
p7 <- ggplot(loc_stats_fil, aes(x = MISS_SFL.fil)) +
geom_histogram(binwidth = 0.01, color = "black", fill = "darkorange") +
geom_vline(aes(xintercept = mean(MISS_SFL.fil, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = 0.1),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "% missing data per locus") +
theme_standard
# Per-locus diagnostic plots.  Convention: red dashed line = mean, blue
# dashed line = filtering threshold (depth 20, missingness 0.1).
# plot distribution mean read depth ----
p8 <- ggplot(loc_stats_fil, aes(x = MEAN_DEPTH_SFL.fil)) +
geom_histogram(binwidth = 5, color = "black", fill = "darkorange") +
geom_vline(aes(xintercept = mean(MEAN_DEPTH_SFL.fil, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = 20),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "mean read depth per locus") +
theme_standard
# plot read depth vs missing data ----
p9 <- ggplot(loc_stats_fil, aes(x = MEAN_DEPTH_SFL.fil, y = MISS_SFL.fil)) +
geom_point() +
geom_vline(aes(xintercept = mean(MEAN_DEPTH_SFL.fil, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = 20),
color = "darkblue", linetype = "dashed", size = 1) +
geom_hline(aes(yintercept = mean(MISS_SFL.fil, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_hline(aes(yintercept = 0.1),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "mean depth per locus", y = "% missing data") +
theme_standard
# count SNPs (VCF rows) per contig ----
# counted once and reused both for the histogram (p10) and for the
# SNPs-vs-depth comparison below
temp <- count(loc_stats_fil, CHR)
# plot no of SNPs per locus ----
p10 <- ggplot(temp, aes(x = n)) +
geom_histogram(binwidth = 1, color = "black", fill = "darkorange") +
labs(x = "number of SNPs per locus") +
theme_standard
# plot number of SNPs per contig vs. mean depth ----
p11 <- left_join(temp, loc_stats_fil) %>%
ggplot() +
geom_point(aes(x = n, y = MEAN_DEPTH_SFL.fil)) +
labs(x = "number of SNPs per contig", y = "mean depth") +
theme_standard
# plot depth vs SNP quality ----
# PROB converts the phred-scaled site quality to an error probability
# (computed for reference; not plotted below)
site_qual <- read.table("data/VCF/SFL.fil.lqual",
header = TRUE, stringsAsFactors = FALSE) %>%
mutate(PROB = 10^(-QUAL/10))
# NOTE(review): depth and quality are paired by row position, not by a
# CHR/POS join -- this assumes the .ldepth.mean and .lqual outputs list
# sites in the same order; confirm, or join on CHR/POS instead.
temp <- data.frame(loc_stats_fil$MEAN_DEPTH_SFL.fil, site_qual$QUAL) %>%
rename(depth = loc_stats_fil.MEAN_DEPTH_SFL.fil, qual = site_qual.QUAL)
p12 <- ggplot(temp, aes(x = depth, y = qual)) +
geom_point(size = 1) +
geom_vline(aes(xintercept = mean(depth, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = 20),
color = "darkblue", linetype = "dashed", size = 1) +
geom_hline(aes(yintercept = mean(qual, na.rm = TRUE)),
color = "red", linetype = "dashed", size = 1) +
geom_hline(aes(yintercept = 20),
color = "darkblue", linetype = "dashed", size = 1) +
labs(x = "mean depth per locus", y = "SNP quality") +
theme_standard
# arrange all twelve diagnostic panels in a two-column grid
mfil <- multiplot(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, cols=2)